Commit b411b363 authored by Philipp Reisner's avatar Philipp Reisner Committed by Jens Axboe

The DRBD driver

Signed-off-by: default avatarPhilipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: default avatarLars Ellenberg <lars.ellenberg@linbit.com>
parent 1a35e0f6
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.0"
width="210mm"
height="297mm"
viewBox="0 0 21000 29700"
id="svg2"
style="fill-rule:evenodd">
<defs
id="defs4" />
<g
id="Default"
style="visibility:visible">
<desc
id="desc180">Master slide</desc>
</g>
<path
d="M 11999,8601 L 11899,8301 L 12099,8301 L 11999,8601 z"
id="path193"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,7801 L 11999,8361"
id="path197"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 7999,10401 L 7899,10101 L 8099,10101 L 7999,10401 z"
id="path209"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,9601 L 7999,10161"
id="path213"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,7801 L 11685,7840 L 11724,7644 L 11999,7801 z"
id="path225"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,7001 L 11764,7754"
id="path229"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-1244.4792,1416.5139)"
id="g245"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text247">
<tspan
x="9139 9368 9579 9808 9986 10075 10252 10481 10659 10837 10909"
y="9284"
id="tspan249">RSDataReply</tspan>
</text>
</g>
<path
d="M 7999,9601 L 8281,9458 L 8311,9655 L 7999,9601 z"
id="path259"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,9001 L 8236,9565"
id="path263"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,1620.9382,-1639.4947)"
id="g279"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text281">
<tspan
x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
y="7023"
id="tspan283">CsumRSRequest</tspan>
</text>
</g>
<text
id="text297"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="5707"
id="tspan299">w_make_resync_request()</tspan>
</text>
<text
id="text313"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="7806"
id="tspan315">receive_DataRequest()</tspan>
</text>
<text
id="text329"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="8606"
id="tspan331">drbd_endio_read_sec()</tspan>
</text>
<text
id="text345"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
y="9007"
id="tspan347">w_e_end_csum_rs_req()</tspan>
</text>
<text
id="text361"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
y="9507"
id="tspan363">receive_RSDataReply()</tspan>
</text>
<text
id="text377"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
y="10407"
id="tspan379">drbd_endio_write_sec()</tspan>
</text>
<text
id="text393"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
y="10907"
id="tspan395">e_end_resync_block()</tspan>
</text>
<path
d="M 11999,11601 L 11685,11640 L 11724,11444 L 11999,11601 z"
id="path405"
style="fill:#000080;visibility:visible" />
<path
d="M 7999,10801 L 11764,11554"
id="path409"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,2434.7562,-1674.649)"
id="g425"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text427">
<tspan
x="9320 9621 9726 9798 9887 10065 10277 10438"
y="10943"
id="tspan429">WriteAck</tspan>
</text>
</g>
<text
id="text443"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
y="11559"
id="tspan445">got_BlockAck()</tspan>
</text>
<text
id="text459"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14302 14540 14658 14777 14870 15107 15225 15437 15649 15886"
y="4877"
id="tspan461">Checksum based Resync, case not in sync</tspan>
</text>
<text
id="text475"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6961 7266 7571 7854 8159 8299 8536 8654 8891 9010 9247 9484 9603 9840 9958 10077 10170 10407"
y="2806"
id="tspan477">DRBD-8.3 data flow</tspan>
</text>
<text
id="text491"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
y="7005"
id="tspan493">w_e_send_csum()</tspan>
</text>
<path
d="M 11999,17601 L 11899,17301 L 12099,17301 L 11999,17601 z"
id="path503"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,16801 L 11999,17361"
id="path507"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,16801 L 11685,16840 L 11724,16644 L 11999,16801 z"
id="path519"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,16001 L 11764,16754"
id="path523"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-2539.5806,1529.3491)"
id="g539"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text541">
<tspan
x="9269 9498 9709 9798 9959 10048 10226 10437 10598 10776"
y="18265"
id="tspan543">RSIsInSync</tspan>
</text>
</g>
<path
d="M 7999,18601 L 8281,18458 L 8311,18655 L 7999,18601 z"
id="path553"
style="fill:#000080;visibility:visible" />
<path
d="M 11999,18001 L 8236,18565"
id="path557"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,3461.4027,-1449.3012)"
id="g573"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text575">
<tspan
x="8743 8972 9132 9310 9573 9801 10013 10242 10419 10597 10775 10953 11114"
y="16023"
id="tspan577">CsumRSRequest</tspan>
</text>
</g>
<text
id="text591"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="16806"
id="tspan593">receive_DataRequest()</tspan>
</text>
<text
id="text607"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="17606"
id="tspan609">drbd_endio_read_sec()</tspan>
</text>
<text
id="text623"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13825 13986 14164 14426 14604 14710 14871 15049 15154 15332 15510 15616"
y="18007"
id="tspan625">w_e_end_csum_rs_req()</tspan>
</text>
<text
id="text639"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5735 5913 6091 6180 6357 6446 6607 6696 6874 7085 7246 7424 7585 7691"
y="18507"
id="tspan641">got_IsInSync()</tspan>
</text>
<text
id="text655"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8778 8990 9201 9413 9650 10001 10120 10357 10594 10806 11043 11280 11398 11703 11940 12152 12364 12601 12812 12931 13049 13261 13498 13710 13947 14065 14159 14396 14514 14726 14937 15175"
y="13877"
id="tspan657">Checksum based Resync, case in sync</tspan>
</text>
<path
d="M 12000,24601 L 11900,24301 L 12100,24301 L 12000,24601 z"
id="path667"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,23801 L 12000,24361"
id="path671"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 8000,26401 L 7900,26101 L 8100,26101 L 8000,26401 z"
id="path683"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,25601 L 8000,26161"
id="path687"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,23801 L 11686,23840 L 11725,23644 L 12000,23801 z"
id="path699"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,23001 L 11765,23754"
id="path703"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,-3543.8452,1630.5143)"
id="g719"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text721">
<tspan
x="9464 9710 9921 10150 10328 10505 10577"
y="25236"
id="tspan723">OVReply</tspan>
</text>
</g>
<path
d="M 8000,25601 L 8282,25458 L 8312,25655 L 8000,25601 z"
id="path733"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,25001 L 8237,25565"
id="path737"
style="fill:none;stroke:#008000;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,4918.2801,-1381.2128)"
id="g753"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<text
id="text755">
<tspan
x="9142 9388 9599 9828 10006 10183 10361 10539 10700"
y="23106"
id="tspan757">OVRequest</tspan>
</text>
</g>
<text
id="text771"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13656 13868 14097 14274 14452 14630 14808 14969 15058 15163"
y="23806"
id="tspan773">receive_OVRequest()</tspan>
</text>
<text
id="text787"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
y="24606"
id="tspan789">drbd_endio_read_sec()</tspan>
</text>
<text
id="text803"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14004 14182 14288 14465 14643 14749"
y="25007"
id="tspan805">w_e_end_ov_req()</tspan>
</text>
<text
id="text819"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5101 5207 5385 5546 5723 5795 5956 6134 6312 6557 6769 6998 7175 7353 7425 7586 7692"
y="25507"
id="tspan821">receive_OVReply()</tspan>
</text>
<text
id="text835"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="26407"
id="tspan837">drbd_endio_read_sec()</tspan>
</text>
<text
id="text851"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4902 5131 5308 5486 5664 5842 6020 6197 6375 6553 6714 6892 6998 7175 7353 7425 7586 7692"
y="26907"
id="tspan853">w_e_end_ov_reply()</tspan>
</text>
<path
d="M 12000,27601 L 11686,27640 L 11725,27444 L 12000,27601 z"
id="path863"
style="fill:#000080;visibility:visible" />
<path
d="M 8000,26801 L 11765,27554"
id="path867"
style="fill:none;stroke:#000080;visibility:visible" />
<g
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,5704.1907,-1328.312)"
id="g883"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<text
id="text885">
<tspan
x="9279 9525 9736 9965 10143 10303 10481 10553"
y="26935"
id="tspan887">OVResult</tspan>
</text>
</g>
<text
id="text901"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12556 12645 12822 13068 13280 13508 13686 13847 14025 14097 14185 14291"
y="27559"
id="tspan903">got_OVResult()</tspan>
</text>
<text
id="text917"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8330 8567 8660 8754 8991 9228 9346 9558 9795 9935 10028 10146"
y="21877"
id="tspan919">Online verify</tspan>
</text>
<text
id="text933"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4641 4870 5047 5310 5488 5649 5826 6004 6182 6343 6521 6626 6804 6982 7160 7338 7499 7587 7693"
y="23005"
id="tspan935">w_make_ov_request()</tspan>
</text>
<path
d="M 8000,6500 L 7900,6200 L 8100,6200 L 8000,6500 z"
id="path945"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,5700 L 8000,6260"
id="path949"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 3900,5500 L 3700,5500 L 3700,11000 L 3900,11000"
id="path961"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,14500 L 3700,14500 L 3700,18600 L 3900,18600"
id="path973"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,22800 L 3700,22800 L 3700,26900 L 3900,26900"
id="path985"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text1001"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="6506"
id="tspan1003">drbd_endio_read_sec()</tspan>
</text>
<text
id="text1017"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="14708"
id="tspan1019">w_make_resync_request()</tspan>
</text>
<text
id="text1033"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5190 5419 5596 5774 5952 6113 6291 6468 6646 6824 6985 7146 7324 7586 7692"
y="16006"
id="tspan1035">w_e_send_csum()</tspan>
</text>
<path
d="M 8000,15501 L 7900,15201 L 8100,15201 L 8000,15501 z"
id="path1045"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,14701 L 8000,15261"
id="path1049"
style="fill:none;stroke:#008000;visibility:visible" />
<text
id="text1065"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4492 4670 4776 4953 5131 5309 5487 5665 5842 5914 6092 6270 6376 6554 6731 6909 7087 7248 7426 7587 7692"
y="15507"
id="tspan1067">drbd_endio_read_sec()</tspan>
</text>
<path
d="M 16100,9000 L 16300,9000 L 16300,7500 L 16100,7500"
id="path1077"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,18000 L 16300,18000 L 16300,16500 L 16100,16500"
id="path1089"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,25000 L 16300,25000 L 16300,23500 L 16100,23500"
id="path1101"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text1117"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
y="5402"
id="tspan1119">rs_begin_io()</tspan>
</text>
<text
id="text1133"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2027 2133 2294 2472 2649 2827 3005 3077 3255 3432 3504 3682 3788"
y="14402"
id="tspan1135">rs_begin_io()</tspan>
</text>
<text
id="text1149"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2026 2132 2293 2471 2648 2826 3004 3076 3254 3431 3503 3681 3787"
y="22602"
id="tspan1151">rs_begin_io()</tspan>
</text>
<text
id="text1165"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1426 1532 1693 1871 2031 2209 2472 2649 2721 2899 2988 3166 3344 3416 3593 3699"
y="11302"
id="tspan1167">rs_complete_io()</tspan>
</text>
<text
id="text1181"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
y="18931"
id="tspan1183">rs_complete_io()</tspan>
</text>
<text
id="text1197"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1526 1632 1793 1971 2131 2309 2572 2749 2821 2999 3088 3266 3444 3516 3693 3799"
y="27231"
id="tspan1199">rs_complete_io()</tspan>
</text>
<text
id="text1213"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16126 16232 16393 16571 16748 16926 17104 17176 17354 17531 17603 17781 17887"
y="7402"
id="tspan1215">rs_begin_io()</tspan>
</text>
<text
id="text1229"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
y="16331"
id="tspan1231">rs_begin_io()</tspan>
</text>
<text
id="text1245"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16127 16233 16394 16572 16749 16927 17105 17177 17355 17532 17604 17782 17888"
y="23302"
id="tspan1247">rs_begin_io()</tspan>
</text>
<text
id="text1261"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="9302"
id="tspan1263">rs_complete_io()</tspan>
</text>
<text
id="text1277"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="18331"
id="tspan1279">rs_complete_io()</tspan>
</text>
<text
id="text1293"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16126 16232 16393 16571 16731 16909 17172 17349 17421 17599 17688 17866 18044 18116 18293 18399"
y="25302"
id="tspan1295">rs_complete_io()</tspan>
</text>
</svg>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg"
version="1.0"
width="210mm"
height="297mm"
viewBox="0 0 21000 29700"
id="svg2"
style="fill-rule:evenodd">
<defs
id="defs4" />
<g
id="Default"
style="visibility:visible">
<desc
id="desc176">Master slide</desc>
</g>
<path
d="M 11999,19601 L 11899,19301 L 12099,19301 L 11999,19601 z"
id="path189"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,18801 L 11999,19361"
id="path193"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 7999,21401 L 7899,21101 L 8099,21101 L 7999,21401 z"
id="path205"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,20601 L 7999,21161"
id="path209"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 11999,18801 L 11685,18840 L 11724,18644 L 11999,18801 z"
id="path221"
style="fill:#008000;visibility:visible" />
<path
d="M 7999,18001 L 11764,18754"
id="path225"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-3023.845"
y="1106.8124"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text243"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6115.1553 6344.1553 6555.1553 6784.1553 6962.1553 7051.1553 7228.1553 7457.1553 7635.1553 7813.1553 7885.1553"
y="21390.812"
id="tspan245">RSDataReply</tspan>
</text>
<path
d="M 7999,20601 L 8281,20458 L 8311,20655 L 7999,20601 z"
id="path255"
style="fill:#008000;visibility:visible" />
<path
d="M 11999,20001 L 8236,20565"
id="path259"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="3502.5356"
y="-2184.6621"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text277"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12321.536 12550.536 12761.536 12990.536 13168.536 13257.536 13434.536 13663.536 13841.536 14019.536 14196.536 14374.536 14535.536"
y="15854.338"
id="tspan279">RSDataRequest</tspan>
</text>
<text
id="text293"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4034 4263 4440 4703 4881 5042 5219 5397 5503 5681 5842 6003 6180 6341 6519 6625 6803 6980 7158 7336 7497 7586 7692"
y="17807"
id="tspan295">w_make_resync_request()</tspan>
</text>
<text
id="text309"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12305 12483 12644 12821 12893 13054 13232 13410 13638 13816 13905 14083 14311 14489 14667 14845 15023 15184 15272 15378"
y="18806"
id="tspan311">receive_DataRequest()</tspan>
</text>
<text
id="text325"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12483 12660 12838 13016 13194 13372 13549 13621 13799 13977 14083 14261 14438 14616 14794 14955 15133 15294 15399"
y="19606"
id="tspan327">drbd_endio_read_sec()</tspan>
</text>
<text
id="text341"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12191 12420 12597 12775 12953 13131 13309 13486 13664 13770 13931 14109 14287 14375 14553 14731 14837 15015 15192 15298"
y="20007"
id="tspan343">w_e_end_rsdata_req()</tspan>
</text>
<text
id="text357"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4444 4550 4728 4889 5066 5138 5299 5477 5655 5883 6095 6324 6501 6590 6768 6997 7175 7352 7424 7585 7691"
y="20507"
id="tspan359">receive_RSDataReply()</tspan>
</text>
<text
id="text373"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4457 4635 4741 4918 5096 5274 5452 5630 5807 5879 6057 6235 6464 6569 6641 6730 6908 7086 7247 7425 7585 7691"
y="21407"
id="tspan375">drbd_endio_write_sec()</tspan>
</text>
<text
id="text389"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4647 4825 5003 5180 5358 5536 5714 5820 5997 6158 6319 6497 6658 6836 7013 7085 7263 7424 7585 7691"
y="21907"
id="tspan391">e_end_resync_block()</tspan>
</text>
<path
d="M 11999,22601 L 11685,22640 L 11724,22444 L 11999,22601 z"
id="path401"
style="fill:#000080;visibility:visible" />
<path
d="M 7999,21801 L 11764,22554"
id="path405"
style="fill:none;stroke:#000080;visibility:visible" />
<text
x="4290.3008"
y="-2369.6162"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text423"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="13610.301 13911.301 14016.301 14088.301 14177.301 14355.301 14567.301 14728.301"
y="19573.385"
id="tspan425">WriteAck</tspan>
</text>
<text
id="text439"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12199 12377 12555 12644 12821 13033 13105 13283 13444 13604 13816 13977 14138 14244"
y="22559"
id="tspan441">got_BlockAck()</tspan>
</text>
<text
id="text455"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7999 8304 8541 8753 8964 9201 9413 9531 9769 9862 10099 10310 10522 10734 10852 10971 11208 11348 11585 11822"
y="16877"
id="tspan457">Resync blocks, 4-32K</tspan>
</text>
<path
d="M 12000,7601 L 11900,7301 L 12100,7301 L 12000,7601 z"
id="path467"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,6801 L 12000,7361"
id="path471"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,6801 L 11686,6840 L 11725,6644 L 12000,6801 z"
id="path483"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,6001 L 11765,6754"
id="path487"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-1288.1796"
y="1279.7666"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text505"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8174.8208 8475.8203 8580.8203 8652.8203 8741.8203 8919.8203 9131.8203 9292.8203"
y="9516.7666"
id="tspan507">WriteAck</tspan>
</text>
<path
d="M 8000,8601 L 8282,8458 L 8312,8655 L 8000,8601 z"
id="path517"
style="fill:#000080;visibility:visible" />
<path
d="M 12000,8001 L 8237,8565"
id="path521"
style="fill:none;stroke:#000080;visibility:visible" />
<text
x="1065.6655"
y="-2097.7664"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text539"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="10682.666 10911.666 11088.666 11177.666"
y="4107.2339"
id="tspan541">Data</tspan>
</text>
<text
id="text555"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
y="5505"
id="tspan557">drbd_make_request()</tspan>
</text>
<text
id="text571"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14190"
y="6806"
id="tspan573">receive_Data()</tspan>
</text>
<text
id="text587"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14207 14312 14384 14473 14651 14829 14990 15168 15328 15434"
y="7606"
id="tspan589">drbd_endio_write_sec()</tspan>
</text>
<text
id="text603"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12370 12548 12725 12903 13081 13259 13437 13509 13686 13847 14008 14114"
y="8007"
id="tspan605">e_end_block()</tspan>
</text>
<text
id="text619"
style="font-size:318px;font-weight:400;fill:#000080;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5647 5825 6003 6092 6269 6481 6553 6731 6892 7052 7264 7425 7586 7692"
y="8606"
id="tspan621">got_BlockAck()</tspan>
</text>
<text
id="text635"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8305 8542 8779 9016 9109 9346 9486 9604 9956 10049 10189 10328 10565 10705 10942 11179 11298 11603 11742 11835 11954 12191 12310 12428 12665 12902 13139 13279 13516 13753"
y="4877"
id="tspan637">Regular mirrored write, 512-32K</tspan>
</text>
<text
id="text651"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5381 5610 5787 5948 6126 6304 6482 6659 6837 7015 7087 7265 7426 7587 7692"
y="6003"
id="tspan653">w_send_dblock()</tspan>
</text>
<path
d="M 8000,6800 L 7900,6500 L 8100,6500 L 8000,6800 z"
id="path663"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,6000 L 8000,6560"
id="path667"
style="fill:none;stroke:#008000;visibility:visible" />
<text
id="text683"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4602 4780 4886 5063 5241 5419 5597 5775 5952 6024 6202 6380 6609 6714 6786 6875 7053 7231 7409 7515 7587 7692"
y="6905"
id="tspan685">drbd_endio_write_pri()</tspan>
</text>
<path
d="M 12000,13602 L 11900,13302 L 12100,13302 L 12000,13602 z"
id="path695"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,12802 L 12000,13362"
id="path699"
style="fill:none;stroke:#008000;visibility:visible" />
<path
d="M 12000,12802 L 11686,12841 L 11725,12645 L 12000,12802 z"
id="path711"
style="fill:#008000;visibility:visible" />
<path
d="M 8000,12002 L 11765,12755"
id="path715"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="-2155.5266"
y="1201.5964"
transform="matrix(0.9895258,-0.1443562,0.1443562,0.9895258,0,0)"
id="text733"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="7202.4736 7431.4736 7608.4736 7697.4736 7875.4736 8104.4736 8282.4736 8459.4736 8531.4736"
y="15454.597"
id="tspan735">DataReply</tspan>
</text>
<path
d="M 8000,14602 L 8282,14459 L 8312,14656 L 8000,14602 z"
id="path745"
style="fill:#008000;visibility:visible" />
<path
d="M 12000,14002 L 8237,14566"
id="path749"
style="fill:none;stroke:#008000;visibility:visible" />
<text
x="2280.3804"
y="-2103.2141"
transform="matrix(0.9788674,0.2044961,-0.2044961,0.9788674,0,0)"
id="text767"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="11316.381 11545.381 11722.381 11811.381 11989.381 12218.381 12396.381 12573.381 12751.381 12929.381 13090.381"
y="9981.7861"
id="tspan769">DataRequest</tspan>
</text>
<text
id="text783"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="4746 4924 5030 5207 5385 5563 5826 6003 6164 6342 6520 6626 6803 6981 7159 7337 7498 7587 7692"
y="11506"
id="tspan785">drbd_make_request()</tspan>
</text>
<text
id="text799"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12306 12484 12645 12822 12894 13055 13233 13411 13639 13817 13906 14084 14312 14490 14668 14846 15024 15185 15273 15379"
y="12807"
id="tspan801">receive_DataRequest()</tspan>
</text>
<text
id="text815"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12200 12378 12484 12661 12839 13017 13195 13373 13550 13622 13800 13978 14084 14262 14439 14617 14795 14956 15134 15295 15400"
y="13607"
id="tspan817">drbd_endio_read_sec()</tspan>
</text>
<text
id="text831"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="12192 12421 12598 12776 12954 13132 13310 13487 13665 13843 14021 14110 14288 14465 14571 14749 14927 15033"
y="14008"
id="tspan833">w_e_end_data_req()</tspan>
</text>
<g
id="g835"
style="visibility:visible">
<desc
id="desc837">Drawing</desc>
<text
id="text847"
style="font-size:318px;font-weight:400;fill:#008000;font-family:Helvetica embedded">
<tspan
x="4885 4991 5169 5330 5507 5579 5740 5918 6096 6324 6502 6591 6769 6997 7175 7353 7425 7586 7692"
y="14607"
id="tspan849">receive_DataReply()</tspan>
</text>
</g>
<text
id="text863"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="8000 8305 8398 8610 8821 8914 9151 9363 9575 9693 9833 10070 10307 10544 10663 10781 11018 11255 11493 11632 11869 12106"
y="10878"
id="tspan865">Diskless read, 512-32K</tspan>
</text>
<text
id="text879"
style="font-size:318px;font-weight:400;fill:#008000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="5029 5258 5435 5596 5774 5952 6130 6307 6413 6591 6769 6947 7125 7230 7408 7586 7692"
y="12004"
id="tspan881">w_send_read_req()</tspan>
</text>
<text
id="text895"
style="font-size:423px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="6961 7266 7571 7854 8159 8278 8515 8633 8870 9107 9226 9463 9581 9700 9793 10030"
y="2806"
id="tspan897">DRBD 8 data flow</tspan>
</text>
<path
d="M 3900,5300 L 3700,5300 L 3700,7000 L 3900,7000"
id="path907"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 3900,17600 L 3700,17600 L 3700,22000 L 3900,22000"
id="path919"
style="fill:none;stroke:#000000;visibility:visible" />
<path
d="M 16100,20000 L 16300,20000 L 16300,18500 L 16100,18500"
id="path931"
style="fill:none;stroke:#000000;visibility:visible" />
<text
id="text947"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2126 2304 2376 2554 2731 2909 3087 3159 3337 3515 3587 3764 3870"
y="5202"
id="tspan949">al_begin_io()</tspan>
</text>
<text
id="text963"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1632 1810 1882 2060 2220 2398 2661 2839 2910 3088 3177 3355 3533 3605 3783 3888"
y="7331"
id="tspan965">al_complete_io()</tspan>
</text>
<text
id="text979"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="2126 2232 2393 2571 2748 2926 3104 3176 3354 3531 3603 3781 3887"
y="17431"
id="tspan981">rs_begin_io()</tspan>
</text>
<text
id="text995"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="1626 1732 1893 2071 2231 2409 2672 2849 2921 3099 3188 3366 3544 3616 3793 3899"
y="22331"
id="tspan997">rs_complete_io()</tspan>
</text>
<text
id="text1011"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16027 16133 16294 16472 16649 16827 17005 17077 17255 17432 17504 17682 17788"
y="18402"
id="tspan1013">rs_begin_io()</tspan>
</text>
<text
id="text1027"
style="font-size:318px;font-weight:400;fill:#000000;visibility:visible;font-family:Helvetica embedded">
<tspan
x="16115 16221 16382 16560 16720 16898 17161 17338 17410 17588 17677 17855 18033 18105 18282 18388"
y="20331"
id="tspan1029">rs_complete_io()</tspan>
</text>
</svg>
Description
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Please visit http://www.drbd.org to find out more.
The here included files are intended to help understand the implementation
DRBD-8.3-data-packets.svg, DRBD-data-packets.svg
relates some functions, and write packets.
conn-states-8.dot, disk-states-8.dot, node-states-8.dot
The sub graphs of DRBD's state transitions
digraph conn_states {
StandAllone -> WFConnection [ label = "ioctl_set_net()" ]
WFConnection -> Unconnected [ label = "unable to bind()" ]
WFConnection -> WFReportParams [ label = "in connect() after accept" ]
WFReportParams -> StandAllone [ label = "checks in receive_param()" ]
WFReportParams -> Connected [ label = "in receive_param()" ]
WFReportParams -> WFBitMapS [ label = "sync_handshake()" ]
WFReportParams -> WFBitMapT [ label = "sync_handshake()" ]
WFBitMapS -> SyncSource [ label = "receive_bitmap()" ]
WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ]
SyncSource -> Connected
SyncTarget -> Connected
SyncSource -> PausedSyncS
SyncTarget -> PausedSyncT
PausedSyncS -> SyncSource
PausedSyncT -> SyncTarget
Connected -> WFConnection [ label = "* on network error" ]
}
digraph disk_states {
Diskless -> Inconsistent [ label = "ioctl_set_disk()" ]
Diskless -> Consistent [ label = "ioctl_set_disk()" ]
Diskless -> Outdated [ label = "ioctl_set_disk()" ]
Consistent -> Outdated [ label = "receive_param()" ]
Consistent -> UpToDate [ label = "receive_param()" ]
Consistent -> Inconsistent [ label = "start resync" ]
Outdated -> Inconsistent [ label = "start resync" ]
UpToDate -> Inconsistent [ label = "ioctl_replicate" ]
Inconsistent -> UpToDate [ label = "resync completed" ]
Consistent -> Failed [ label = "io completion error" ]
Outdated -> Failed [ label = "io completion error" ]
UpToDate -> Failed [ label = "io completion error" ]
Inconsistent -> Failed [ label = "io completion error" ]
Failed -> Diskless [ label = "sending notify to peer" ]
}
// vim: set sw=2 sts=2 :
digraph {
rankdir=BT
bgcolor=white
node [shape=plaintext]
node [fontcolor=black]
StandAlone [ style=filled,fillcolor=gray,label=StandAlone ]
node [fontcolor=lightgray]
Unconnected [ label=Unconnected ]
CommTrouble [ shape=record,
label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ]
node [fontcolor=gray]
subgraph cluster_try_connect {
label="try to connect, handshake"
rank=max
WFConnection [ label=WFConnection ]
WFReportParams [ label=WFReportParams ]
}
TearDown [ label=TearDown ]
Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ]
node [fontcolor=lightblue]
StartingSyncS [ label=StartingSyncS ]
StartingSyncT [ label=StartingSyncT ]
subgraph cluster_bitmap_exchange {
node [fontcolor=red]
fontcolor=red
label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged"
WFBitMapT [ label=WFBitMapT ]
WFSyncUUID [ label=WFSyncUUID ]
WFBitMapS [ label=WFBitMapS ]
}
node [fontcolor=blue]
cluster_resync [ shape=record,label="{<any>resynchronisation process running\l'concurrent' application requests allowed|{{<T>PausedSyncT\nSyncTarget}|{<S>PausedSyncS\nSyncSource}}}" ]
node [shape=box,fontcolor=black]
// drbdadm [label="drbdadm connect"]
// handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."]
// comm_error [label="communication trouble"]
//
// edges
// --------------------------------------
StandAlone -> Unconnected [ label="drbdadm connect" ]
Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ]
Unconnected -> WFConnection [ label="receiver thread is started" ]
WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ]
WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ]
WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ]
WFReportParams -> WFBitMapS
WFReportParams -> WFBitMapT
WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false]
WFBitMapS -> cluster_resync:S
WFSyncUUID -> cluster_resync:T
edge [color=green]
cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ]
edge [color=red]
WFReportParams -> CommTrouble
Connected -> CommTrouble
cluster_resync:any -> CommTrouble
edge [color=black]
CommTrouble -> Unconnected [label="receiver thread is stopped" ]
}
digraph node_states {
Secondary -> Primary [ label = "ioctl_set_state()" ]
Primary -> Secondary [ label = "ioctl_set_state()" ]
}
digraph peer_states {
Secondary -> Primary [ label = "recv state packet" ]
Primary -> Secondary [ label = "recv state packet" ]
Primary -> Unknown [ label = "connection lost" ]
Secondary -> Unknown [ label = "connection lost" ]
Unknown -> Primary [ label = "connected" ]
Unknown -> Secondary [ label = "connected" ]
}
...@@ -1758,6 +1758,19 @@ S: Maintained ...@@ -1758,6 +1758,19 @@ S: Maintained
F: drivers/scsi/dpt* F: drivers/scsi/dpt*
F: drivers/scsi/dpt/ F: drivers/scsi/dpt/
DRBD DRIVER
P: Philipp Reisner
P: Lars Ellenberg
M: drbd-dev@lists.linbit.com
L: drbd-user@lists.linbit.com
W: http://www.drbd.org
T: git git://git.drbd.org/linux-2.6-drbd.git drbd
T: git git://git.drbd.org/drbd-8.3.git
S: Supported
F: drivers/block/drbd/
F: lib/lru_cache.c
F: Documentation/blockdev/drbd/
DRIVER CORE, KOBJECTS, AND SYSFS DRIVER CORE, KOBJECTS, AND SYSFS
M: Greg Kroah-Hartman <gregkh@suse.de> M: Greg Kroah-Hartman <gregkh@suse.de>
T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
......
...@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP ...@@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP
instead, which can be configured to be on-disk compatible with the instead, which can be configured to be on-disk compatible with the
cryptoloop device. cryptoloop device.
source "drivers/block/drbd/Kconfig"
config BLK_DEV_NBD config BLK_DEV_NBD
tristate "Network block device support" tristate "Network block device support"
depends on NET depends on NET
......
...@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o ...@@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o
obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
swim_mod-objs := swim.o swim_asm.o swim_mod-objs := swim.o swim_asm.o
#
# DRBD device driver configuration
#
comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected"
depends on !PROC_FS || !INET || !CONNECTOR
config BLK_DEV_DRBD
tristate "DRBD Distributed Replicated Block Device support"
depends on PROC_FS && INET && CONNECTOR
select LRU_CACHE
default n
help
NOTE: In order to authenticate connections you have to select
CRYPTO_HMAC and a hash function as well.
DRBD is a shared-nothing, synchronously replicated block device. It
is designed to serve as a building block for high availability
clusters and in this context, is a "drop-in" replacement for shared
storage. Simplistically, you could see it as a network RAID 1.
Each minor device has a role, which can be 'primary' or 'secondary'.
On the node with the primary device the application is supposed to
run and to access the device (/dev/drbdX). Every write is sent to
the local 'lower level block device' and, across the network, to the
node with the device in 'secondary' state. The secondary device
simply writes the data to its lower level block device.
DRBD can also be used in dual-Primary mode (device writable on both
nodes), which means it can exhibit shared disk semantics in a
shared-nothing cluster. Needless to say, on top of dual-Primary
DRBD utilizing a cluster file system is necessary to maintain for
cache coherency.
For automatic failover you need a cluster manager (e.g. heartbeat).
See also: http://www.drbd.org/, http://www.linux-ha.org
If unsure, say N.
config DRBD_TRACE
tristate "DRBD tracing"
depends on BLK_DEV_DRBD
select TRACEPOINTS
default n
help
Say Y here if you want to be able to trace various events in DRBD.
If unsure, say N.
config DRBD_FAULT_INJECTION
bool "DRBD fault injection"
depends on BLK_DEV_DRBD
help
Say Y here if you want to simulate IO errors, in order to test DRBD's
behavior.
The actual simulation of IO errors is done by writing 3 values to
/sys/module/drbd/parameters/
enable_faults: bitmask of...
1 meta data write
2 read
4 resync data write
8 read
16 data write
32 data read
64 read ahead
128 kmalloc of bitmap
256 allocation of EE (epoch_entries)
fault_devs: bitmask of minor numbers
fault_rate: frequency in percent
Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
echo 16 > /sys/module/drbd/parameters/enable_faults
echo 1 > /sys/module/drbd/parameters/fault_devs
echo 5 > /sys/module/drbd/parameters/fault_rate
If unsure, say N.
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
drbd_trace-y := drbd_tracing.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o
/*
drbd_actlog.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_tracing.h"
#include "drbd_wrappers.h"
/* We maintain a trivial check sum in our on disk activity log.
* With that we can ensure correct operation even when the storage
* device might do a partial (last) sector write while loosing power.
*/
struct __packed al_transaction {
u32 magic;
u32 tr_number;
struct __packed {
u32 pos;
u32 extent; } updates[1 + AL_EXTENTS_PT];
u32 xor_sum;
};
struct update_odbm_work {
struct drbd_work w;
unsigned int enr;
};
struct update_al_work {
struct drbd_work w;
struct lc_element *al_ext;
struct completion event;
unsigned int enr;
/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
unsigned int old_enr;
};
struct drbd_atodb_wait {
atomic_t count;
struct completion io_done;
struct drbd_conf *mdev;
int error;
};
int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
/* The actual tracepoint needs to have constant number of known arguments...
*/
void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
trace__drbd_resync(mdev, level, fmt, ap);
va_end(ap);
}
static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev,
struct page *page, sector_t sector,
int rw, int size)
{
struct bio *bio;
struct drbd_md_io md_io;
int ok;
md_io.mdev = mdev;
init_completion(&md_io.event);
md_io.error = 0;
if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags))
rw |= (1 << BIO_RW_BARRIER);
rw |= ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO));
retry:
bio = bio_alloc(GFP_NOIO, 1);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
ok = (bio_add_page(bio, page, size, 0) == size);
if (!ok)
goto out;
bio->bi_private = &md_io;
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
trace_drbd_bio(mdev, "Md", bio, 0, NULL);
if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
submit_bio(rw, bio);
wait_for_completion(&md_io.event);
ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
/* check for unsupported barrier op.
* would rather check on EOPNOTSUPP, but that is not reliable.
* don't try again for ANY return value != 0 */
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) {
/* Try again with no barrier */
dev_warn(DEV, "Barriers not supported on meta data device - disabling\n");
set_bit(MD_NO_BARRIER, &mdev->flags);
rw &= ~(1 << BIO_RW_BARRIER);
bio_put(bio);
goto retry;
}
out:
bio_put(bio);
return ok;
}
int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t sector, int rw)
{
int logical_block_size, mask, ok;
int offset = 0;
struct page *iop = mdev->md_io_page;
D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
BUG_ON(!bdev->md_bdev);
logical_block_size = bdev_logical_block_size(bdev->md_bdev);
if (logical_block_size == 0)
logical_block_size = MD_SECTOR_SIZE;
/* in case logical_block_size != 512 [ s390 only? ] */
if (logical_block_size != MD_SECTOR_SIZE) {
mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
D_ASSERT(mask == 1 || mask == 3 || mask == 7);
D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
offset = sector & mask;
sector = sector & ~mask;
iop = mdev->md_io_tmpp;
if (rw & WRITE) {
/* these are GFP_KERNEL pages, pre-allocated
* on device initialization */
void *p = page_address(mdev->md_io_page);
void *hp = page_address(mdev->md_io_tmpp);
ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
READ, logical_block_size);
if (unlikely(!ok)) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
"READ [logical_block_size!=512]) failed!\n",
(unsigned long long)sector);
return 0;
}
memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
}
}
if (sector < drbd_md_first_sector(bdev) ||
sector > drbd_md_last_sector(bdev))
dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
if (unlikely(!ok)) {
dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
return 0;
}
if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
void *p = page_address(mdev->md_io_page);
void *hp = page_address(mdev->md_io_tmpp);
memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
}
return ok;
}
static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
struct lc_element *tmp;
unsigned long al_flags = 0;
spin_lock_irq(&mdev->al_lock);
tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
if (unlikely(tmp != NULL)) {
struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
spin_unlock_irq(&mdev->al_lock);
return NULL;
}
}
al_ext = lc_get(mdev->act_log, enr);
al_flags = mdev->act_log->flags;
spin_unlock_irq(&mdev->al_lock);
/*
if (!al_ext) {
if (al_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
if (al_flags & LC_DIRTY)
dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
}
*/
return al_ext;
}
void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
{
unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
struct lc_element *al_ext;
struct update_al_work al_work;
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
trace_drbd_actlog(mdev, sector, "al_begin_io");
wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
if (al_ext->lc_number != enr) {
/* drbd_al_write_transaction(mdev,al_ext,enr);
* recurses into generic_make_request(), which
* disallows recursion, bios being serialized on the
* current->bio_tail list now.
* we have to delegate updates to the activity log
* to the worker thread. */
init_completion(&al_work.event);
al_work.al_ext = al_ext;
al_work.enr = enr;
al_work.old_enr = al_ext->lc_number;
al_work.w.cb = w_al_write_transaction;
drbd_queue_work_front(&mdev->data.work, &al_work.w);
wait_for_completion(&al_work.event);
mdev->al_writ_cnt++;
spin_lock_irq(&mdev->al_lock);
lc_changed(mdev->act_log, al_ext);
spin_unlock_irq(&mdev->al_lock);
wake_up(&mdev->al_wait);
}
}
void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
{
unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
struct lc_element *extent;
unsigned long flags;
trace_drbd_actlog(mdev, sector, "al_complete_io");
spin_lock_irqsave(&mdev->al_lock, flags);
extent = lc_find(mdev->act_log, enr);
if (!extent) {
spin_unlock_irqrestore(&mdev->al_lock, flags);
dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
return;
}
if (lc_put(mdev->act_log, extent) == 0)
wake_up(&mdev->al_wait);
spin_unlock_irqrestore(&mdev->al_lock, flags);
}
int
w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
struct update_al_work *aw = container_of(w, struct update_al_work, w);
struct lc_element *updated = aw->al_ext;
const unsigned int new_enr = aw->enr;
const unsigned int evicted = aw->old_enr;
struct al_transaction *buffer;
sector_t sector;
int i, n, mx;
unsigned int extent_nr;
u32 xor_sum = 0;
if (!get_ldev(mdev)) {
dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n");
complete(&((struct update_al_work *)w)->event);
return 1;
}
/* do we have to do a bitmap write, first?
* TODO reduce maximum latency:
* submit both bios, then wait for both,
* instead of doing two synchronous sector writes. */
if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);
mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */
buffer = (struct al_transaction *)page_address(mdev->md_io_page);
buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
n = lc_index_of(mdev->act_log, updated);
buffer->updates[0].pos = cpu_to_be32(n);
buffer->updates[0].extent = cpu_to_be32(new_enr);
xor_sum ^= new_enr;
mx = min_t(int, AL_EXTENTS_PT,
mdev->act_log->nr_elements - mdev->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = mdev->al_tr_cycle + i;
extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
buffer->updates[i+1].pos = cpu_to_be32(idx);
buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
xor_sum ^= extent_nr;
}
for (; i < AL_EXTENTS_PT; i++) {
buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
xor_sum ^= LC_FREE;
}
mdev->al_tr_cycle += AL_EXTENTS_PT;
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
buffer->xor_sum = cpu_to_be32(xor_sum);
sector = mdev->ldev->md.md_offset
+ mdev->ldev->md.al_offset + mdev->al_tr_pos;
if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
drbd_chk_io_error(mdev, 1, TRUE);
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
mdev->al_tr_pos = 0;
D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
mdev->al_tr_number++;
mutex_unlock(&mdev->md_io_mutex);
complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
return 1;
}
/**
* drbd_al_read_tr() - Read a single transaction from the on disk activity log
* @mdev: DRBD device.
* @bdev: Block device to read form.
* @b: pointer to an al_transaction.
* @index: On disk slot of the transaction to read.
*
* Returns -1 on IO error, 0 on checksum error and 1 upon success.
*/
static int drbd_al_read_tr(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev,
struct al_transaction *b,
int index)
{
sector_t sector;
int rv, i;
u32 xor_sum = 0;
sector = bdev->md.md_offset + bdev->md.al_offset + index;
/* Dont process error normally,
* as this is done before disk is attached! */
if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
return -1;
rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
for (i = 0; i < AL_EXTENTS_PT + 1; i++)
xor_sum ^= be32_to_cpu(b->updates[i].extent);
rv &= (xor_sum == be32_to_cpu(b->xor_sum));
return rv;
}
/**
* drbd_al_read_log() - Restores the activity log from its on disk representation.
* @mdev: DRBD device.
* @bdev: Block device to read form.
*
* Returns 1 on success, returns 0 when reading the log failed due to IO errors.
*/
int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
struct al_transaction *buffer;
int i;
int rv;
int mx;
int active_extents = 0;
int transactions = 0;
int found_valid = 0;
int from = 0;
int to = 0;
u32 from_tnr = 0;
u32 to_tnr = 0;
u32 cnr;
mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
/* lock out all other meta data io for now,
* and make sure the page is mapped.
*/
mutex_lock(&mdev->md_io_mutex);
buffer = page_address(mdev->md_io_page);
/* Find the valid transaction in the log */
for (i = 0; i <= mx; i++) {
rv = drbd_al_read_tr(mdev, bdev, buffer, i);
if (rv == 0)
continue;
if (rv == -1) {
mutex_unlock(&mdev->md_io_mutex);
return 0;
}
cnr = be32_to_cpu(buffer->tr_number);
if (++found_valid == 1) {
from = i;
to = i;
from_tnr = cnr;
to_tnr = cnr;
continue;
}
if ((int)cnr - (int)from_tnr < 0) {
D_ASSERT(from_tnr - cnr + i - from == mx+1);
from = i;
from_tnr = cnr;
}
if ((int)cnr - (int)to_tnr > 0) {
D_ASSERT(cnr - to_tnr == i - to);
to = i;
to_tnr = cnr;
}
}
if (!found_valid) {
dev_warn(DEV, "No usable activity log found.\n");
mutex_unlock(&mdev->md_io_mutex);
return 1;
}
/* Read the valid transactions.
* dev_info(DEV, "Reading from %d to %d.\n",from,to); */
i = from;
while (1) {
int j, pos;
unsigned int extent_nr;
unsigned int trn;
rv = drbd_al_read_tr(mdev, bdev, buffer, i);
ERR_IF(rv == 0) goto cancel;
if (rv == -1) {
mutex_unlock(&mdev->md_io_mutex);
return 0;
}
trn = be32_to_cpu(buffer->tr_number);
spin_lock_irq(&mdev->al_lock);
/* This loop runs backwards because in the cyclic
elements there might be an old version of the
updated element (in slot 0). So the element in slot 0
can overwrite old versions. */
for (j = AL_EXTENTS_PT; j >= 0; j--) {
pos = be32_to_cpu(buffer->updates[j].pos);
extent_nr = be32_to_cpu(buffer->updates[j].extent);
if (extent_nr == LC_FREE)
continue;
lc_set(mdev->act_log, extent_nr, pos);
active_extents++;
}
spin_unlock_irq(&mdev->al_lock);
transactions++;
cancel:
if (i == to)
break;
i++;
if (i > mx)
i = 0;
}
mdev->al_tr_number = to_tnr+1;
mdev->al_tr_pos = to;
if (++mdev->al_tr_pos >
div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
mdev->al_tr_pos = 0;
/* ok, we are done with it */
mutex_unlock(&mdev->md_io_mutex);
dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
transactions, active_extents);
return 1;
}
static void atodb_endio(struct bio *bio, int error)
{
struct drbd_atodb_wait *wc = bio->bi_private;
struct drbd_conf *mdev = wc->mdev;
struct page *page;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
if (!error && !uptodate)
error = -EIO;
drbd_chk_io_error(mdev, error, TRUE);
if (error && wc->error == 0)
wc->error = error;
if (atomic_dec_and_test(&wc->count))
complete(&wc->io_done);
page = bio->bi_io_vec[0].bv_page;
put_page(page);
bio_put(bio);
mdev->bm_writ_cnt++;
put_ldev(mdev);
}
#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
/* activity log to on disk bitmap -- prepare bio unless that sector
* is already covered by previously prepared bios */
static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
struct bio **bios,
unsigned int enr,
struct drbd_atodb_wait *wc) __must_hold(local)
{
struct bio *bio;
struct page *page;
sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+ mdev->ldev->md.bm_offset;
unsigned int page_offset = PAGE_SIZE;
int offset;
int i = 0;
int err = -ENOMEM;
/* Check if that enr is already covered by an already created bio.
* Caution, bios[] is not NULL terminated,
* but only initialized to all NULL.
* For completely scattered activity log,
* the last invocation iterates over all bios,
* and finds the last NULL entry.
*/
while ((bio = bios[i])) {
if (bio->bi_sector == on_disk_sector)
return 0;
i++;
}
/* bios[i] == NULL, the next not yet used slot */
/* GFP_KERNEL, we are not in the write-out path */
bio = bio_alloc(GFP_KERNEL, 1);
if (bio == NULL)
return -ENOMEM;
if (i > 0) {
const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
page_offset = prev_bv->bv_offset + prev_bv->bv_len;
page = prev_bv->bv_page;
}
if (page_offset == PAGE_SIZE) {
page = alloc_page(__GFP_HIGHMEM);
if (page == NULL)
goto out_bio_put;
page_offset = 0;
} else {
get_page(page);
}
offset = S2W(enr);
drbd_bm_get_lel(mdev, offset,
min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset),
kmap(page) + page_offset);
kunmap(page);
bio->bi_private = wc;
bio->bi_end_io = atodb_endio;
bio->bi_bdev = mdev->ldev->md_bdev;
bio->bi_sector = on_disk_sector;
if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE)
goto out_put_page;
atomic_inc(&wc->count);
/* we already know that we may do this...
* get_ldev_if_state(mdev,D_ATTACHING);
* just get the extra reference, so that the local_cnt reflects
* the number of pending IO requests DRBD at its backing device.
*/
atomic_inc(&mdev->local_cnt);
bios[i] = bio;
return 0;
out_put_page:
err = -EINVAL;
put_page(page);
out_bio_put:
bio_put(bio);
return err;
}
/**
* drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents
* @mdev: DRBD device.
*
* Called when we detach (unconfigure) local storage,
* or when we go from R_PRIMARY to R_SECONDARY role.
*/
void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
{
int i, nr_elements;
unsigned int enr;
struct bio **bios;
struct drbd_atodb_wait wc;
ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
return; /* sorry, I don't have any act_log etc... */
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
nr_elements = mdev->act_log->nr_elements;
/* GFP_KERNEL, we are not in anyone's write-out path */
bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
if (!bios)
goto submit_one_by_one;
atomic_set(&wc.count, 0);
init_completion(&wc.io_done);
wc.mdev = mdev;
wc.error = 0;
for (i = 0; i < nr_elements; i++) {
enr = lc_element_by_index(mdev->act_log, i)->lc_number;
if (enr == LC_FREE)
continue;
/* next statement also does atomic_inc wc.count and local_cnt */
if (atodb_prepare_unless_covered(mdev, bios,
enr/AL_EXT_PER_BM_SECT,
&wc))
goto free_bios_submit_one_by_one;
}
/* unnecessary optimization? */
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
/* all prepared, submit them */
for (i = 0; i < nr_elements; i++) {
if (bios[i] == NULL)
break;
if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
bios[i]->bi_rw = WRITE;
bio_endio(bios[i], -EIO);
} else {
submit_bio(WRITE, bios[i]);
}
}
drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
/* always (try to) flush bitmap to stable storage */
drbd_md_flush(mdev);
/* In case we did not submit a single IO do not wait for
* them to complete. ( Because we would wait forever here. )
*
* In case we had IOs and they are already complete, there
* is not point in waiting anyways.
* Therefore this if () ... */
if (atomic_read(&wc.count))
wait_for_completion(&wc.io_done);
put_ldev(mdev);
kfree(bios);
return;
free_bios_submit_one_by_one:
/* free everything by calling the endio callback directly. */
for (i = 0; i < nr_elements && bios[i]; i++)
bio_endio(bios[i], 0);
kfree(bios);
submit_one_by_one:
dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");
for (i = 0; i < mdev->act_log->nr_elements; i++) {
enr = lc_element_by_index(mdev->act_log, i)->lc_number;
if (enr == LC_FREE)
continue;
/* Really slow: if we have al-extents 16..19 active,
* sector 4 will be written four times! Synchronous! */
drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
}
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
put_ldev(mdev);
}
/**
* drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
* @mdev: DRBD device.
*/
void drbd_al_apply_to_bm(struct drbd_conf *mdev)
{
unsigned int enr;
unsigned long add = 0;
char ppb[10];
int i;
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
for (i = 0; i < mdev->act_log->nr_elements; i++) {
enr = lc_element_by_index(mdev->act_log, i)->lc_number;
if (enr == LC_FREE)
continue;
add += drbd_bm_ALe_set_all(mdev, enr);
}
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
ppsize(ppb, Bit2KB(add)));
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
{
int rv;
spin_lock_irq(&mdev->al_lock);
rv = (al_ext->refcnt == 0);
if (likely(rv))
lc_del(mdev->act_log, al_ext);
spin_unlock_irq(&mdev->al_lock);
return rv;
}
/**
* drbd_al_shrink() - Removes all active extents form the activity log
* @mdev: DRBD device.
*
* Removes all active extents form the activity log, waiting until
* the reference count of each entry dropped to 0 first, of course.
*
* You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
*/
void drbd_al_shrink(struct drbd_conf *mdev)
{
struct lc_element *al_ext;
int i;
D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
for (i = 0; i < mdev->act_log->nr_elements; i++) {
al_ext = lc_element_by_index(mdev->act_log, i);
if (al_ext->lc_number == LC_FREE)
continue;
wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
}
wake_up(&mdev->al_wait);
}
static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
kfree(udw);
return 1;
}
drbd_bm_write_sect(mdev, udw->enr);
put_ldev(mdev);
kfree(udw);
if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
switch (mdev->state.conn) {
case C_SYNC_SOURCE: case C_SYNC_TARGET:
case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
drbd_resync_finished(mdev);
default:
/* nothing to do */
break;
}
}
drbd_bcast_sync_progress(mdev);
return 1;
}
/* ATTENTION. The AL's extents are 4MB each, while the extents in the
* resync LRU-cache are 16MB each.
* The caller of this function has to hold an get_ldev() reference.
*
* TODO will be obsoleted once we have a caching lru of the on disk bitmap
*/
static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
int count, int success)
{
struct lc_element *e;
struct update_odbm_work *udw;
unsigned int enr;
D_ASSERT(atomic_read(&mdev->local_cnt));
/* I simply assume that a sector/size pair never crosses
* a 16 MB extent border. (Currently this is true...) */
enr = BM_SECT_TO_EXT(sector);
e = lc_get(mdev->resync, enr);
if (e) {
struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
if (ext->lce.lc_number == enr) {
if (success)
ext->rs_left -= count;
else
ext->rs_failed += count;
if (ext->rs_left < ext->rs_failed) {
dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
"rs_failed=%d count=%d\n",
(unsigned long long)sector,
ext->lce.lc_number, ext->rs_left,
ext->rs_failed, count);
dump_stack();
lc_put(mdev->resync, &ext->lce);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return;
}
} else {
/* Normally this element should be in the cache,
* since drbd_rs_begin_io() pulled it already in.
*
* But maybe an application write finished, and we set
* something outside the resync lru_cache in sync.
*/
int rs_left = drbd_bm_e_weight(mdev, enr);
if (ext->flags != 0) {
dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
" -> %d[%u;00]\n",
ext->lce.lc_number, ext->rs_left,
ext->flags, enr, rs_left);
ext->flags = 0;
}
if (ext->rs_failed) {
dev_warn(DEV, "Kicking resync_lru element enr=%u "
"out with rs_failed=%d\n",
ext->lce.lc_number, ext->rs_failed);
set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
lc_changed(mdev->resync, &ext->lce);
}
lc_put(mdev->resync, &ext->lce);
/* no race, we are within the al_lock! */
if (ext->rs_left == ext->rs_failed) {
ext->rs_failed = 0;
udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
if (udw) {
udw->enr = ext->lce.lc_number;
udw->w.cb = w_update_odbm;
drbd_queue_work_front(&mdev->data.work, &udw->w);
} else {
dev_warn(DEV, "Could not kmalloc an udw\n");
set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
}
} else {
dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
mdev->resync_locked,
mdev->resync->nr_elements,
mdev->resync->flags);
}
}
/* clear the bit corresponding to the piece of storage in question:
* size byte of data starting from sector. Only clear a bits of the affected
* one ore more _aligned_ BM_BLOCK_SIZE blocks.
*
* called by worker on C_SYNC_TARGET and receiver on SyncSource.
*
*/
void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
/* Is called from worker and receiver context _only_ */
unsigned long sbnr, ebnr, lbnr;
unsigned long count = 0;
sector_t esector, nr_sectors;
int wake_up = 0;
unsigned long flags;
if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors) return;
ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
/* we clear it (in sync).
* round up start sector, round down end sector. we make sure we only
* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
return;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
trace_drbd_resync(mdev, TRACE_LVL_METRICS,
"drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
(unsigned long long)sector, size, sbnr, ebnr);
if (sbnr > ebnr)
return;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
spin_lock_irqsave(&mdev->al_lock, flags);
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
if (count) {
/* we need the lock for drbd_try_clear_on_disk_bm */
if (jiffies - mdev->rs_mark_time > HZ*10) {
/* should be rolling marks,
* but we estimate only anyways. */
if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
mdev->state.conn != C_PAUSED_SYNC_T &&
mdev->state.conn != C_PAUSED_SYNC_S) {
mdev->rs_mark_time = jiffies;
mdev->rs_mark_left = drbd_bm_total_weight(mdev);
}
}
if (get_ldev(mdev)) {
drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
put_ldev(mdev);
}
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
}
spin_unlock_irqrestore(&mdev->al_lock, flags);
if (wake_up)
wake_up(&mdev->al_wait);
}
/*
* this is intended to set one request worth of data out of sync.
* affects at least 1 bit,
* and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits.
*
* called by tl_clear and drbd_send_dblock (==drbd_make_request).
* so this can be _any_ process.
*/
void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
unsigned long sbnr, ebnr, lbnr, flags;
sector_t esector, nr_sectors;
unsigned int enr, count;
struct lc_element *e;
if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
return;
}
if (!get_ldev(mdev))
return; /* no disk, no metadata, no bitmap to set bits in */
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors)
goto out;
ERR_IF(esector >= nr_sectors)
esector = (nr_sectors-1);
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
/* we set it out of sync,
* we do not need to round anything here */
sbnr = BM_SECT_TO_BIT(sector);
ebnr = BM_SECT_TO_BIT(esector);
trace_drbd_resync(mdev, TRACE_LVL_METRICS,
"drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n",
(unsigned long long)sector, size, sbnr, ebnr);
/* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors. */
spin_lock_irqsave(&mdev->al_lock, flags);
count = drbd_bm_set_bits(mdev, sbnr, ebnr);
enr = BM_SECT_TO_EXT(sector);
e = lc_find(mdev->resync, enr);
if (e)
lc_entry(e, struct bm_extent, lce)->rs_left += count;
spin_unlock_irqrestore(&mdev->al_lock, flags);
out:
put_ldev(mdev);
}
static
struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *e;
struct bm_extent *bm_ext;
int wakeup = 0;
unsigned long rs_flags;
spin_lock_irq(&mdev->al_lock);
if (mdev->resync_locked > mdev->resync->nr_elements/2) {
spin_unlock_irq(&mdev->al_lock);
return NULL;
}
e = lc_get(mdev->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
lc_changed(mdev->resync, &bm_ext->lce);
wakeup = 1;
}
if (bm_ext->lce.refcnt == 1)
mdev->resync_locked++;
set_bit(BME_NO_WRITES, &bm_ext->flags);
}
rs_flags = mdev->resync->flags;
spin_unlock_irq(&mdev->al_lock);
if (wakeup)
wake_up(&mdev->al_wait);
if (!bm_ext) {
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
BUG_ON(rs_flags & LC_DIRTY);
}
return bm_ext;
}
static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
int rv = 0;
spin_lock_irq(&mdev->al_lock);
if (unlikely(enr == mdev->act_log->new_number))
rv = 1;
else {
al_ext = lc_find(mdev->act_log, enr);
if (al_ext) {
if (al_ext->refcnt)
rv = 1;
}
}
spin_unlock_irq(&mdev->al_lock);
/*
if (unlikely(rv)) {
dev_info(DEV, "Delaying sync read until app's write is done\n");
}
*/
return rv;
}
/**
* drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
* @mdev: DRBD device.
* @sector: The sector number.
*
* This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
*/
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
struct bm_extent *bm_ext;
int i, sig;
trace_drbd_resync(mdev, TRACE_LVL_ALL,
"drbd_rs_begin_io: sector=%llus (rs_end=%d)\n",
(unsigned long long)sector, enr);
sig = wait_event_interruptible(mdev->al_wait,
(bm_ext = _bme_get(mdev, enr)));
if (sig)
return 0;
if (test_bit(BME_LOCKED, &bm_ext->flags))
return 1;
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
sig = wait_event_interruptible(mdev->al_wait,
!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
if (sig) {
spin_lock_irq(&mdev->al_lock);
if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
clear_bit(BME_NO_WRITES, &bm_ext->flags);
mdev->resync_locked--;
wake_up(&mdev->al_wait);
}
spin_unlock_irq(&mdev->al_lock);
return 0;
}
}
set_bit(BME_LOCKED, &bm_ext->flags);
return 1;
}
/**
* drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
* @mdev: DRBD device.
* @sector: The sector number.
*
* Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
* tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
* if there is still application IO going on in this area.
*/
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
struct lc_element *e;
struct bm_extent *bm_ext;
int i;
trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n",
(unsigned long long)sector);
spin_lock_irq(&mdev->al_lock);
if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
/* in case you have very heavy scattered io, it may
* stall the syncer undefined if we give up the ref count
* when we try again and requeue.
*
* if we don't give up the refcount, but the next time
* we are scheduled this extent has been "synced" by new
* application writes, we'd miss the lc_put on the
* extent we keep the refcount on.
* so we remembered which extent we had to try again, and
* if the next requested one is something else, we do
* the lc_put here...
* we also have to wake_up
*/
trace_drbd_resync(mdev, TRACE_LVL_ALL,
"dropping %u, apparently got 'synced' by application io\n",
mdev->resync_wenr);
e = lc_find(mdev->resync, mdev->resync_wenr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
clear_bit(BME_NO_WRITES, &bm_ext->flags);
mdev->resync_wenr = LC_FREE;
if (lc_put(mdev->resync, &bm_ext->lce) == 0)
mdev->resync_locked--;
wake_up(&mdev->al_wait);
} else {
dev_alert(DEV, "LOGIC BUG\n");
}
}
/* TRY. */
e = lc_try_get(mdev->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext) {
if (test_bit(BME_LOCKED, &bm_ext->flags))
goto proceed;
if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
mdev->resync_locked++;
} else {
/* we did set the BME_NO_WRITES,
* but then could not set BME_LOCKED,
* so we tried again.
* drop the extra reference. */
trace_drbd_resync(mdev, TRACE_LVL_ALL,
"dropping extra reference on %u\n", enr);
bm_ext->lce.refcnt--;
D_ASSERT(bm_ext->lce.refcnt > 0);
}
goto check_al;
} else {
/* do we rather want to try later? */
if (mdev->resync_locked > mdev->resync->nr_elements-3) {
trace_drbd_resync(mdev, TRACE_LVL_ALL,
"resync_locked = %u!\n", mdev->resync_locked);
goto try_again;
}
/* Do or do not. There is no try. -- Yoda */
e = lc_get(mdev->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
const unsigned long rs_flags = mdev->resync->flags;
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
BUG_ON(rs_flags & LC_DIRTY);
goto try_again;
}
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
lc_changed(mdev->resync, &bm_ext->lce);
wake_up(&mdev->al_wait);
D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
}
set_bit(BME_NO_WRITES, &bm_ext->flags);
D_ASSERT(bm_ext->lce.refcnt == 1);
mdev->resync_locked++;
goto check_al;
}
check_al:
trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr);
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
if (unlikely(al_enr+i == mdev->act_log->new_number))
goto try_again;
if (lc_is_used(mdev->act_log, al_enr+i))
goto try_again;
}
set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
mdev->resync_wenr = LC_FREE;
spin_unlock_irq(&mdev->al_lock);
return 0;
try_again:
trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr);
if (bm_ext)
mdev->resync_wenr = enr;
spin_unlock_irq(&mdev->al_lock);
return -EAGAIN;
}
void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
{
unsigned int enr = BM_SECT_TO_EXT(sector);
struct lc_element *e;
struct bm_extent *bm_ext;
unsigned long flags;
trace_drbd_resync(mdev, TRACE_LVL_ALL,
"drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n",
(long long)sector, enr);
spin_lock_irqsave(&mdev->al_lock, flags);
e = lc_find(mdev->resync, enr);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
spin_unlock_irqrestore(&mdev->al_lock, flags);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
return;
}
if (bm_ext->lce.refcnt == 0) {
spin_unlock_irqrestore(&mdev->al_lock, flags);
dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
"but refcnt is 0!?\n",
(unsigned long long)sector, enr);
return;
}
if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
clear_bit(BME_LOCKED, &bm_ext->flags);
clear_bit(BME_NO_WRITES, &bm_ext->flags);
mdev->resync_locked--;
wake_up(&mdev->al_wait);
}
spin_unlock_irqrestore(&mdev->al_lock, flags);
}
/**
* drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
* @mdev: DRBD device.
*/
void drbd_rs_cancel_all(struct drbd_conf *mdev)
{
trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n");
spin_lock_irq(&mdev->al_lock);
if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
lc_reset(mdev->resync);
put_ldev(mdev);
}
mdev->resync_locked = 0;
mdev->resync_wenr = LC_FREE;
spin_unlock_irq(&mdev->al_lock);
wake_up(&mdev->al_wait);
}
/**
* drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
* @mdev: DRBD device.
*
* Returns 0 upon success, -EAGAIN if at least one reference count was
* not zero.
*/
int drbd_rs_del_all(struct drbd_conf *mdev)
{
struct lc_element *e;
struct bm_extent *bm_ext;
int i;
trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n");
spin_lock_irq(&mdev->al_lock);
if (get_ldev_if_state(mdev, D_FAILED)) {
/* ok, ->resync is there. */
for (i = 0; i < mdev->resync->nr_elements; i++) {
e = lc_element_by_index(mdev->resync, i);
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (bm_ext->lce.lc_number == LC_FREE)
continue;
if (bm_ext->lce.lc_number == mdev->resync_wenr) {
dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
" got 'synced' by application io\n",
mdev->resync_wenr);
D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
clear_bit(BME_NO_WRITES, &bm_ext->flags);
mdev->resync_wenr = LC_FREE;
lc_put(mdev->resync, &bm_ext->lce);
}
if (bm_ext->lce.refcnt != 0) {
dev_info(DEV, "Retrying drbd_rs_del_all() later. "
"refcnt=%d\n", bm_ext->lce.refcnt);
put_ldev(mdev);
spin_unlock_irq(&mdev->al_lock);
return -EAGAIN;
}
D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
lc_del(mdev->resync, &bm_ext->lce);
}
D_ASSERT(mdev->resync->used == 0);
put_ldev(mdev);
}
spin_unlock_irq(&mdev->al_lock);
return 0;
}
/**
* drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
* @mdev: DRBD device.
* @sector: The sector number.
* @size: Size of failed IO operation, in byte.
*/
void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
{
/* Is called from worker and receiver context _only_ */
unsigned long sbnr, ebnr, lbnr;
unsigned long count;
sector_t esector, nr_sectors;
int wake_up = 0;
trace_drbd_resync(mdev, TRACE_LVL_SUMMARY,
"drbd_rs_failed_io: sector=%llus, size=%u\n",
(unsigned long long)sector, size);
if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
ERR_IF(sector >= nr_sectors) return;
ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
/*
* round up start sector, round down end sector. we make sure we only
* handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
return;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
if (sbnr > ebnr)
return;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
spin_lock_irq(&mdev->al_lock);
count = drbd_bm_count_bits(mdev, sbnr, ebnr);
if (count) {
mdev->rs_failed += count;
if (get_ldev(mdev)) {
drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
put_ldev(mdev);
}
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
}
spin_unlock_irq(&mdev->al_lock);
if (wake_up)
wake_up(&mdev->al_wait);
}
/*
drbd_bitmap.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/bitops.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/drbd.h>
#include <asm/kmap_types.h>
#include "drbd_int.h"
/* OPAQUE outside this file!
* interface defined in drbd_int.h
* convention:
* function name drbd_bm_... => used elsewhere, "public".
* function name bm_... => internal to implementation, "private".
* Note that since find_first_bit returns int, at the current granularity of
* the bitmap (4KB per byte), this implementation "only" supports up to
* 1<<(32+12) == 16 TB...
*/
/*
* NOTE
* Access to the *bm_pages is protected by bm_lock.
* It is safe to read the other members within the lock.
*
* drbd_bm_set_bits is called from bio_endio callbacks,
* We may be called with irq already disabled,
* so we need spin_lock_irqsave().
* And we need the kmap_atomic.
*/
struct drbd_bitmap {
struct page **bm_pages;
spinlock_t bm_lock;
/* WARNING unsigned long bm_*:
* 32bit number of bit offset is just enough for 512 MB bitmap.
* it will blow up if we make the bitmap bigger...
* not that it makes much sense to have a bitmap that large,
* rather change the granularity to 16k or 64k or something.
* (that implies other problems, however...)
*/
unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
unsigned long bm_bits;
size_t bm_words;
size_t bm_number_of_pages;
sector_t bm_dev_capacity;
struct semaphore bm_change; /* serializes resize operations */
atomic_t bm_async_io;
wait_queue_head_t bm_io_wait;
unsigned long bm_flags;
/* debugging aid, in case we are still racy somewhere */
char *bm_why;
struct task_struct *bm_task;
};
/* definition of bits in bm_flags */
#define BM_LOCKED 0
#define BM_MD_IO_ERROR 1
#define BM_P_VMALLOCED 2
static int bm_is_locked(struct drbd_bitmap *b)
{
return test_bit(BM_LOCKED, &b->bm_flags);
}
#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func)
{
struct drbd_bitmap *b = mdev->bitmap;
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
func, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why)
{
struct drbd_bitmap *b = mdev->bitmap;
int trylock_failed;
if (!b) {
dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n");
return;
}
trylock_failed = down_trylock(&b->bm_change);
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
current == mdev->receiver.task ? "receiver" :
current == mdev->asender.task ? "asender" :
current == mdev->worker.task ? "worker" : current->comm,
why, b->bm_why ?: "?",
b->bm_task == mdev->receiver.task ? "receiver" :
b->bm_task == mdev->asender.task ? "asender" :
b->bm_task == mdev->worker.task ? "worker" : "?");
down(&b->bm_change);
}
if (__test_and_set_bit(BM_LOCKED, &b->bm_flags))
dev_err(DEV, "FIXME bitmap already locked in bm_lock\n");
b->bm_why = why;
b->bm_task = current;
}
void drbd_bm_unlock(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
if (!b) {
dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n");
return;
}
if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags))
dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n");
b->bm_why = NULL;
b->bm_task = NULL;
up(&b->bm_change);
}
/* word offset to long pointer */
static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km)
{
struct page *page;
unsigned long page_nr;
/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
BUG_ON(page_nr >= b->bm_number_of_pages);
page = b->bm_pages[page_nr];
return (unsigned long *) kmap_atomic(page, km);
}
static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset)
{
return __bm_map_paddr(b, offset, KM_IRQ1);
}
static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
{
kunmap_atomic(p_addr, km);
};
static void bm_unmap(unsigned long *p_addr)
{
return __bm_unmap(p_addr, KM_IRQ1);
}
/* long word offset of _bitmap_ sector */
#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
/* word offset from start of bitmap to word number _in_page_
* modulo longs per page
#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
hm, well, Philipp thinks gcc might not optimze the % into & (... - 1)
so do it explicitly:
*/
#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
/* Long words per page */
#define LWPP (PAGE_SIZE/sizeof(long))
/*
* actually most functions herein should take a struct drbd_bitmap*, not a
* struct drbd_conf*, but for the debug macros I like to have the mdev around
* to be able to report device specific.
*/
static void bm_free_pages(struct page **pages, unsigned long number)
{
unsigned long i;
if (!pages)
return;
for (i = 0; i < number; i++) {
if (!pages[i]) {
printk(KERN_ALERT "drbd: bm_free_pages tried to free "
"a NULL pointer; i=%lu n=%lu\n",
i, number);
continue;
}
__free_page(pages[i]);
pages[i] = NULL;
}
}
static void bm_vk_free(void *ptr, int v)
{
if (v)
vfree(ptr);
else
kfree(ptr);
}
/*
* "have" and "want" are NUMBER OF PAGES.
*/
static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
{
struct page **old_pages = b->bm_pages;
struct page **new_pages, *page;
unsigned int i, bytes, vmalloced = 0;
unsigned long have = b->bm_number_of_pages;
BUG_ON(have == 0 && old_pages != NULL);
BUG_ON(have != 0 && old_pages == NULL);
if (have == want)
return old_pages;
/* Trying kmalloc first, falling back to vmalloc.
* GFP_KERNEL is ok, as this is done when a lower level disk is
* "attached" to the drbd. Context is receiver thread or cqueue
* thread. As we have no disk yet, we are not in the IO path,
* not even the IO path of the peer. */
bytes = sizeof(struct page *)*want;
new_pages = kmalloc(bytes, GFP_KERNEL);
if (!new_pages) {
new_pages = vmalloc(bytes);
if (!new_pages)
return NULL;
vmalloced = 1;
}
memset(new_pages, 0, bytes);
if (want >= have) {
for (i = 0; i < have; i++)
new_pages[i] = old_pages[i];
for (; i < want; i++) {
page = alloc_page(GFP_HIGHUSER);
if (!page) {
bm_free_pages(new_pages + have, i - have);
bm_vk_free(new_pages, vmalloced);
return NULL;
}
new_pages[i] = page;
}
} else {
for (i = 0; i < want; i++)
new_pages[i] = old_pages[i];
/* NOT HERE, we are outside the spinlock!
bm_free_pages(old_pages + want, have - want);
*/
}
if (vmalloced)
set_bit(BM_P_VMALLOCED, &b->bm_flags);
else
clear_bit(BM_P_VMALLOCED, &b->bm_flags);
return new_pages;
}
/*
* called on driver init only. TODO call when a device is created.
* allocates the drbd_bitmap, and stores it in mdev->bitmap.
*/
int drbd_bm_init(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
WARN_ON(b != NULL);
b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
if (!b)
return -ENOMEM;
spin_lock_init(&b->bm_lock);
init_MUTEX(&b->bm_change);
init_waitqueue_head(&b->bm_io_wait);
mdev->bitmap = b;
return 0;
}
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
ERR_IF(!mdev->bitmap) return 0;
return mdev->bitmap->bm_dev_capacity;
}
/* called on driver unload. TODO: call when a device is destroyed.
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
ERR_IF (!mdev->bitmap) return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
mdev->bitmap = NULL;
}
/*
* since (b->bm_bits % BITS_PER_LONG) != 0,
* this masks out the remaining bits.
* Returns the number of bits cleared.
*/
static int bm_clear_surplus(struct drbd_bitmap *b)
{
const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
size_t w = b->bm_bits >> LN2_BPL;
int cleared = 0;
unsigned long *p_addr, *bm;
p_addr = bm_map_paddr(b, w);
bm = p_addr + MLPP(w);
if (w < b->bm_words) {
cleared = hweight_long(*bm & ~mask);
*bm &= mask;
w++; bm++;
}
if (w < b->bm_words) {
cleared += hweight_long(*bm);
*bm = 0;
}
bm_unmap(p_addr);
return cleared;
}
static void bm_set_surplus(struct drbd_bitmap *b)
{
const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1;
size_t w = b->bm_bits >> LN2_BPL;
unsigned long *p_addr, *bm;
p_addr = bm_map_paddr(b, w);
bm = p_addr + MLPP(w);
if (w < b->bm_words) {
*bm |= ~mask;
bm++; w++;
}
if (w < b->bm_words) {
*bm = ~(0UL);
}
bm_unmap(p_addr);
}
static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian)
{
unsigned long *p_addr, *bm, offset = 0;
unsigned long bits = 0;
unsigned long i, do_now;
while (offset < b->bm_words) {
i = do_now = min_t(size_t, b->bm_words-offset, LWPP);
p_addr = __bm_map_paddr(b, offset, KM_USER0);
bm = p_addr + MLPP(offset);
while (i--) {
#ifndef __LITTLE_ENDIAN
if (swap_endian)
*bm = lel_to_cpu(*bm);
#endif
bits += hweight_long(*bm++);
}
__bm_unmap(p_addr, KM_USER0);
offset += do_now;
cond_resched();
}
return bits;
}
static unsigned long bm_count_bits(struct drbd_bitmap *b)
{
return __bm_count_bits(b, 0);
}
static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b)
{
return __bm_count_bits(b, 1);
}
/* offset and len in long words.*/
static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
{
unsigned long *p_addr, *bm;
size_t do_now, end;
#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512)
end = offset + len;
if (end > b->bm_words) {
printk(KERN_ALERT "drbd: bm_memset end > bm_words\n");
return;
}
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
p_addr = bm_map_paddr(b, offset);
bm = p_addr + MLPP(offset);
if (bm+do_now > p_addr + LWPP) {
printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
p_addr, bm, (int)do_now);
break; /* breaks to after catch_oob_access_end() only! */
}
memset(bm, c, do_now * sizeof(long));
bm_unmap(p_addr);
offset += do_now;
}
}
/*
* make sure the bitmap has enough room for the attached storage,
* if necessary, resize.
* called whenever we may have changed the device size.
* returns -ENOMEM if we could not allocate enough memory, 0 on success.
* In case this is actually a resize, we copy the old bitmap into the new one.
* Otherwise, the bitmap is initialized to all bits set.
*/
int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long bits, words, owords, obits, *p_addr, *bm;
unsigned long want, have, onpages; /* number of pages */
struct page **npages, **opages = NULL;
int err = 0, growing;
int opages_vmalloced;
ERR_IF(!b) return -ENOMEM;
drbd_bm_lock(mdev, "resize");
dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n",
(unsigned long long)capacity);
if (capacity == b->bm_dev_capacity)
goto out;
opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags);
if (capacity == 0) {
spin_lock_irq(&b->bm_lock);
opages = b->bm_pages;
onpages = b->bm_number_of_pages;
owords = b->bm_words;
b->bm_pages = NULL;
b->bm_number_of_pages =
b->bm_set =
b->bm_bits =
b->bm_words =
b->bm_dev_capacity = 0;
spin_unlock_irq(&b->bm_lock);
bm_free_pages(opages, onpages);
bm_vk_free(opages, opages_vmalloced);
goto out;
}
bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
/* if we would use
words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
a 32bit host could present the wrong number of words
to a 64bit host.
*/
words = ALIGN(bits, 64) >> LN2_BPL;
if (get_ldev(mdev)) {
D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12));
put_ldev(mdev);
}
/* one extra long to catch off by one errors */
want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
have = b->bm_number_of_pages;
if (want == have) {
D_ASSERT(b->bm_pages != NULL);
npages = b->bm_pages;
} else {
if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC))
npages = NULL;
else
npages = bm_realloc_pages(b, want);
}
if (!npages) {
err = -ENOMEM;
goto out;
}
spin_lock_irq(&b->bm_lock);
opages = b->bm_pages;
owords = b->bm_words;
obits = b->bm_bits;
growing = bits > obits;
if (opages)
bm_set_surplus(b);
b->bm_pages = npages;
b->bm_number_of_pages = want;
b->bm_bits = bits;
b->bm_words = words;
b->bm_dev_capacity = capacity;
if (growing) {
bm_memset(b, owords, 0xff, words-owords);
b->bm_set += bits - obits;
}
if (want < have) {
/* implicit: (opages != NULL) && (opages != npages) */
bm_free_pages(opages + want, have - want);
}
p_addr = bm_map_paddr(b, words);
bm = p_addr + MLPP(words);
*bm = DRBD_MAGIC;
bm_unmap(p_addr);
(void)bm_clear_surplus(b);
spin_unlock_irq(&b->bm_lock);
if (opages != npages)
bm_vk_free(opages, opages_vmalloced);
if (!growing)
b->bm_set = bm_count_bits(b);
dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words);
out:
drbd_bm_unlock(mdev);
return err;
}
/* inherently racy:
* if not protected by other means, return value may be out of date when
* leaving this function...
* we still need to lock it, since it is important that this returns
* bm_set == 0 precisely.
*
* maybe bm_set should be atomic_t ?
*/
static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long s;
unsigned long flags;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
spin_unlock_irqrestore(&b->bm_lock, flags);
return s;
}
unsigned long drbd_bm_total_weight(struct drbd_conf *mdev)
{
unsigned long s;
/* if I don't have a disk, I don't know about out-of-sync status */
if (!get_ldev_if_state(mdev, D_NEGOTIATING))
return 0;
s = _drbd_bm_total_weight(mdev);
put_ldev(mdev);
return s;
}
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
return b->bm_words;
}
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return 0;
return b->bm_bits;
}
/* merge number words from buffer into the bitmap starting at offset.
* buffer[i] is expected to be little endian unsigned long.
* bitmap must be locked by drbd_bm_lock.
* currently only used from receive_bitmap.
*/
void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number,
unsigned long *buffer)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
unsigned long word, bits;
size_t end, do_now;
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
WARN_ON(end > b->bm_words);
spin_lock_irq(&b->bm_lock);
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
p_addr = bm_map_paddr(b, offset);
bm = p_addr + MLPP(offset);
offset += do_now;
while (do_now--) {
bits = hweight_long(*bm);
word = *bm | lel_to_cpu(*buffer++);
*bm++ = word;
b->bm_set += hweight_long(word) - bits;
}
bm_unmap(p_addr);
}
/* with 32bit <-> 64bit cross-platform connect
* this is only correct for current usage,
* where we _know_ that we are 64 bit aligned,
* and know that this function is used in this way, too...
*/
if (end == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
spin_unlock_irq(&b->bm_lock);
}
/* copy number words from the bitmap starting at offset into the buffer.
* buffer[i] will be little endian unsigned long.
*/
void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number,
unsigned long *buffer)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
size_t end, do_now;
end = offset + number;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
(end > b->bm_words) ||
(number <= 0))
dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n",
(unsigned long) offset,
(unsigned long) number,
(unsigned long) b->bm_words);
else {
while (offset < end) {
do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
p_addr = bm_map_paddr(b, offset);
bm = p_addr + MLPP(offset);
offset += do_now;
while (do_now--)
*buffer++ = cpu_to_lel(*bm++);
bm_unmap(p_addr);
}
}
spin_unlock_irq(&b->bm_lock);
}
/* set all bits in the bitmap */
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
(void)bm_clear_surplus(b);
b->bm_set = b->bm_bits;
spin_unlock_irq(&b->bm_lock);
}
/* clear all bits in the bitmap */
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
ERR_IF(!b) return;
ERR_IF(!b->bm_pages) return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
b->bm_set = 0;
spin_unlock_irq(&b->bm_lock);
}
static void bm_async_io_complete(struct bio *bio, int error)
{
struct drbd_bitmap *b = bio->bi_private;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?!
* do we want to WARN() on this? */
if (!error && !uptodate)
error = -EIO;
if (error) {
/* doh. what now?
* for now, set all bits, and flag MD_IO_ERROR */
__set_bit(BM_MD_IO_ERROR, &b->bm_flags);
}
if (atomic_dec_and_test(&b->bm_async_io))
wake_up(&b->bm_io_wait);
bio_put(bio);
}
static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local)
{
/* we are process context. we always get a bio */
struct bio *bio = bio_alloc(GFP_KERNEL, 1);
unsigned int len;
sector_t on_disk_sector =
mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset;
on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
/* this might happen with very small
* flexible external meta data device */
len = min_t(unsigned int, PAGE_SIZE,
(drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9);
bio->bi_bdev = mdev->ldev->md_bdev;
bio->bi_sector = on_disk_sector;
bio_add_page(bio, b->bm_pages[page_nr], len, 0);
bio->bi_private = b;
bio->bi_end_io = bm_async_io_complete;
if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
bio->bi_rw |= rw;
bio_endio(bio, -EIO);
} else {
submit_bio(rw, bio);
}
}
# if defined(__LITTLE_ENDIAN)
/* nothing to do, on disk == in memory */
# define bm_cpu_to_lel(x) ((void)0)
# else
void bm_cpu_to_lel(struct drbd_bitmap *b)
{
/* need to cpu_to_lel all the pages ...
* this may be optimized by using
* cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0;
* the following is still not optimal, but better than nothing */
unsigned int i;
unsigned long *p_addr, *bm;
if (b->bm_set == 0) {
/* no page at all; avoid swap if all is 0 */
i = b->bm_number_of_pages;
} else if (b->bm_set == b->bm_bits) {
/* only the last page */
i = b->bm_number_of_pages - 1;
} else {
/* all pages */
i = 0;
}
for (; i < b->bm_number_of_pages; i++) {
p_addr = kmap_atomic(b->bm_pages[i], KM_USER0);
for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++)
*bm = cpu_to_lel(*bm);
kunmap_atomic(p_addr, KM_USER0);
}
}
# endif
/* lel_to_cpu == cpu_to_lel */
# define bm_lel_to_cpu(x) bm_cpu_to_lel(x)
/*
* bm_rw: read/write the whole bitmap from/to its on disk location.
*/
static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local)
{
struct drbd_bitmap *b = mdev->bitmap;
/* sector_t sector; */
int bm_words, num_pages, i;
unsigned long now;
char ppb[10];
int err = 0;
WARN_ON(!bm_is_locked(b));
/* no spinlock here, the drbd_bm_lock should be enough! */
bm_words = drbd_bm_words(mdev);
num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT;
/* on disk bitmap is little endian */
if (rw == WRITE)
bm_cpu_to_lel(b);
now = jiffies;
atomic_set(&b->bm_async_io, num_pages);
__clear_bit(BM_MD_IO_ERROR, &b->bm_flags);
/* let the layers below us try to merge these bios... */
for (i = 0; i < num_pages; i++)
bm_page_io_async(mdev, b, i, rw);
drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));
wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0);
if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
drbd_chk_io_error(mdev, 1, TRUE);
err = -EIO;
}
now = jiffies;
if (rw == WRITE) {
/* swap back endianness */
bm_lel_to_cpu(b);
/* flush bitmap to stable storage */
drbd_md_flush(mdev);
} else /* rw == READ */ {
/* just read, if necessary adjust endianness */
b->bm_set = bm_count_bits_swap_endian(b);
dev_info(DEV, "recounting of set bits took additional %lu jiffies\n",
jiffies - now);
}
now = b->bm_set;
dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
return err;
}
/**
* drbd_bm_read() - Read the whole bitmap from its on disk location.
* @mdev: DRBD device.
*/
int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
{
return bm_rw(mdev, READ);
}
/**
* drbd_bm_write() - Write the whole bitmap to its on disk location.
* @mdev: DRBD device.
*/
int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
{
return bm_rw(mdev, WRITE);
}
/**
* drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap
* @mdev: DRBD device.
* @enr: Extent number in the resync lru (happens to be sector offset)
*
* The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered
* by a single sector write. Therefore enr == sector offset from the
* start of the bitmap.
*/
int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local)
{
sector_t on_disk_sector = enr + mdev->ldev->md.md_offset
+ mdev->ldev->md.bm_offset;
int bm_words, num_words, offset;
int err = 0;
mutex_lock(&mdev->md_io_mutex);
bm_words = drbd_bm_words(mdev);
offset = S2W(enr); /* word offset into bitmap */
num_words = min(S2W(1), bm_words - offset);
if (num_words < S2W(1))
memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE);
drbd_bm_get_lel(mdev, offset, num_words,
page_address(mdev->md_io_page));
if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) {
int i;
err = -EIO;
dev_err(DEV, "IO ERROR writing bitmap sector %lu "
"(meta-disk sector %llus)\n",
enr, (unsigned long long)on_disk_sector);
drbd_chk_io_error(mdev, 1, TRUE);
for (i = 0; i < AL_EXT_PER_BM_SECT; i++)
drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i);
}
mdev->bm_writ_cnt++;
mutex_unlock(&mdev->md_io_mutex);
return err;
}
/* NOTE
* find_first_bit returns int, we return unsigned long.
* should not make much difference anyways, but ...
*
* this returns a bit number, NOT a sector!
*/
#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1)
static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
const int find_zero_bit, const enum km_type km)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = -1UL;
unsigned long *p_addr;
unsigned long bit_offset; /* bit offset of the mapped page. */
if (bm_fo > b->bm_bits) {
dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
} else {
while (bm_fo < b->bm_bits) {
unsigned long offset;
bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */
offset = bit_offset >> LN2_BPL; /* word offset of the page */
p_addr = __bm_map_paddr(b, offset, km);
if (find_zero_bit)
i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
else
i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK);
__bm_unmap(p_addr, km);
if (i < PAGE_SIZE*8) {
i = bit_offset + i;
if (i >= b->bm_bits)
break;
goto found;
}
bm_fo = bit_offset + PAGE_SIZE*8;
}
i = -1UL;
}
found:
return i;
}
static unsigned long bm_find_next(struct drbd_conf *mdev,
unsigned long bm_fo, const int find_zero_bit)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = -1UL;
ERR_IF(!b) return i;
ERR_IF(!b->bm_pages) return i;
spin_lock_irq(&b->bm_lock);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
spin_unlock_irq(&b->bm_lock);
return i;
}
unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
return bm_find_next(mdev, bm_fo, 0);
}
#if 0
/* not yet needed for anything. */
unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
return bm_find_next(mdev, bm_fo, 1);
}
#endif
/* does not spin_lock_irqsave.
* you must take drbd_bm_lock() first */
unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!bm_is_locked(mdev)); */
return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
}
unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
{
/* WARN_ON(!bm_is_locked(mdev)); */
return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
}
/* returns number of bits actually changed.
* for val != 0, we change 0 -> 1, return code positive
* for val == 0, we change 1 -> 0, return code negative
* wants bitnr, not sector.
* expected to be called for only a few bits (e - s about BITS_PER_LONG).
* Must hold bitmap lock already. */
int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
unsigned long e, int val, const enum km_type km)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr = NULL;
unsigned long bitnr;
unsigned long last_page_nr = -1UL;
int c = 0;
if (e >= b->bm_bits) {
dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
s, e, b->bm_bits);
e = b->bm_bits ? b->bm_bits -1 : 0;
}
for (bitnr = s; bitnr <= e; bitnr++) {
unsigned long offset = bitnr>>LN2_BPL;
unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3);
if (page_nr != last_page_nr) {
if (p_addr)
__bm_unmap(p_addr, km);
p_addr = __bm_map_paddr(b, offset, km);
last_page_nr = page_nr;
}
if (val)
c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr));
else
c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr));
}
if (p_addr)
__bm_unmap(p_addr, km);
b->bm_set += c;
return c;
}
/* returns number of bits actually changed.
* for val != 0, we change 0 -> 1, return code positive
* for val == 0, we change 1 -> 0, return code negative
* wants bitnr, not sector */
int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
const unsigned long e, int val)
{
unsigned long flags;
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1);
spin_unlock_irqrestore(&b->bm_lock, flags);
return c;
}
/* returns number of bits changed 0 -> 1 */
int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
return bm_change_bits_to(mdev, s, e, 1);
}
/* returns number of bits changed 1 -> 0 */
int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
return -bm_change_bits_to(mdev, s, e, 0);
}
/* sets all bits in full words,
* from first_word up to, but not including, last_word */
static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
int page_nr, int first_word, int last_word)
{
int i;
int bits;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
b->bm_set += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr, KM_USER0);
}
/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave.
* You must first drbd_bm_lock().
* Can be called to set the whole bitmap in one go.
* Sets bits from s to e _inclusive_. */
void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
/* First set_bit from the first bit (s)
* up to the next long boundary (sl),
* then assign full words up to the last long boundary (el),
* then set_bit up to and including the last bit (e).
*
* Do not use memset, because we must account for changes,
* so we need to loop over the words with hweight() anyways.
*/
unsigned long sl = ALIGN(s,BITS_PER_LONG);
unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
int first_page;
int last_page;
int page_nr;
int first_word;
int last_word;
if (e - s <= 3*BITS_PER_LONG) {
/* don't bother; el and sl may even be wrong. */
__bm_change_bits_to(mdev, s, e, 1, KM_USER0);
return;
}
/* difference is large enough that we can trust sl and el */
/* bits filling the current long */
if (sl)
__bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0);
first_page = sl >> (3 + PAGE_SHIFT);
last_page = el >> (3 + PAGE_SHIFT);
/* MLPP: modulo longs per page */
/* LWPP: long words per page */
first_word = MLPP(sl >> LN2_BPL);
last_word = LWPP;
/* first and full pages, unless first page == last page */
for (page_nr = first_page; page_nr < last_page; page_nr++) {
bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word);
cond_resched();
first_word = 0;
}
/* last page (respectively only page, for first page == last page) */
last_word = MLPP(el >> LN2_BPL);
bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word);
/* possibly trailing bits.
* example: (e & 63) == 63, el will be e+1.
* if that even was the very last bit,
* it would trigger an assert in __bm_change_bits_to()
*/
if (el <= e)
__bm_change_bits_to(mdev, el, e, 1, KM_USER0);
}
/* returns bit state
* wants bitnr, NOT sector.
* inherently racy... area needs to be locked by means of {al,rs}_lru
* 1 ... bit set
* 0 ... bit not set
* -1 ... first out of bounds access, stop testing for bits!
*/
int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr)
{
unsigned long flags;
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr;
int i;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
if (bitnr < b->bm_bits) {
unsigned long offset = bitnr>>LN2_BPL;
p_addr = bm_map_paddr(b, offset);
i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0;
bm_unmap(p_addr);
} else if (bitnr == b->bm_bits) {
i = -1;
} else { /* (bitnr > b->bm_bits) */
dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
i = 0;
}
spin_unlock_irqrestore(&b->bm_lock, flags);
return i;
}
/* returns number of bits set in the range [s, e] */
int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e)
{
unsigned long flags;
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr = NULL, page_nr = -1;
unsigned long bitnr;
int c = 0;
size_t w;
/* If this is called without a bitmap, that is a bug. But just to be
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
ERR_IF(!b) return 1;
ERR_IF(!b->bm_pages) return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
for (bitnr = s; bitnr <= e; bitnr++) {
w = bitnr >> LN2_BPL;
if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) {
page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3);
if (p_addr)
bm_unmap(p_addr);
p_addr = bm_map_paddr(b, w);
}
ERR_IF (bitnr >= b->bm_bits) {
dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
} else {
c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
}
}
if (p_addr)
bm_unmap(p_addr);
spin_unlock_irqrestore(&b->bm_lock, flags);
return c;
}
/* inherently racy...
* return value may be already out-of-date when this function returns.
* but the general usage is that this is only use during a cstate when bits are
* only cleared, not set, and typically only care for the case when the return
* value is zero, or we already "locked" this "bitmap extent" by other means.
*
* enr is bm-extent number, since we chose to name one sector (512 bytes)
* worth of the bitmap a "bitmap extent".
*
* TODO
* I think since we use it like a reference count, we should use the real
* reference count of some bitmap extent element from some lru instead...
*
*/
int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr)
{
struct drbd_bitmap *b = mdev->bitmap;
int count, s, e;
unsigned long flags;
unsigned long *p_addr, *bm;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
s = S2W(enr);
e = min((size_t)S2W(enr+1), b->bm_words);
count = 0;
if (s < b->bm_words) {
int n = e-s;
p_addr = bm_map_paddr(b, s);
bm = p_addr + MLPP(s);
while (n--)
count += hweight_long(*bm++);
bm_unmap(p_addr);
} else {
dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s);
}
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
/* set all bits covered by the AL-extent al_enr */
unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long *p_addr, *bm;
unsigned long weight;
int count, s, e, i, do_now;
ERR_IF(!b) return 0;
ERR_IF(!b->bm_pages) return 0;
spin_lock_irq(&b->bm_lock);
if (bm_is_locked(b))
bm_print_lock_info(mdev);
weight = b->bm_set;
s = al_enr * BM_WORDS_PER_AL_EXT;
e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
/* assert that s and e are on the same page */
D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
== s >> (PAGE_SHIFT - LN2_BPL + 3));
count = 0;
if (s < b->bm_words) {
i = do_now = e-s;
p_addr = bm_map_paddr(b, s);
bm = p_addr + MLPP(s);
while (i--) {
count += hweight_long(*bm);
*bm = -1UL;
bm++;
}
bm_unmap(p_addr);
b->bm_set += do_now*BITS_PER_LONG - count;
if (e == b->bm_words)
b->bm_set -= bm_clear_surplus(b);
} else {
dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s);
}
weight = b->bm_set - weight;
spin_unlock_irq(&b->bm_lock);
return weight;
}
/*
drbd_int.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_INT_H
#define _DRBD_INT_H
#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/version.h>
#include <linux/list.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/crypto.h>
#include <linux/tcp.h>
#include <linux/mutex.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <net/tcp.h>
#include <linux/lru_cache.h>
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
#else
# define __protected_by(x)
# define __protected_read_by(x)
# define __protected_write_by(x)
# define __must_hold(x)
#endif
#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0)
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
extern int disable_sendpage;
extern int allow_oos;
extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
extern int fault_rate;
extern int fault_devs;
#endif
extern char usermode_helper[];
#ifndef TRUE
#define TRUE 1
#endif
#ifndef FALSE
#define FALSE 0
#endif
/* I don't remember why XCPU ...
* This is used to wake the asender,
* and to interrupt sending the sending task
* on disconnect.
*/
#define DRBD_SIG SIGXCPU
/* This is used to stop/restart our threads.
* Cannot use SIGTERM nor SIGKILL, since these
* are sent out by init on runlevel changes
* I choose SIGHUP for now.
*/
#define DRBD_SIGKILL SIGHUP
/* All EEs on the free list should have ID_VACANT (== 0)
* freshly allocated EEs get !ID_VACANT (== 1)
* so if it says "cannot dereference null pointer at adress 0x00000001",
* it is most likely one of these :( */
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
#define ID_SYNCER (-1ULL)
#define ID_VACANT 0
#define is_syncer_block_id(id) ((id) == ID_SYNCER)
struct drbd_conf;
/* to shorten dev_warn(DEV, "msg"); and relatives statements */
#define DEV (disk_to_dev(mdev->vdisk))
#define D_ASSERT(exp) if (!(exp)) \
dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
#define ERR_IF(exp) if (({ \
int _b = (exp) != 0; \
if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \
__func__, #exp, __FILE__, __LINE__); \
_b; \
}))
/* Defines to control fault insertion */
enum {
DRBD_FAULT_MD_WR = 0, /* meta data write */
DRBD_FAULT_MD_RD = 1, /* read */
DRBD_FAULT_RS_WR = 2, /* resync */
DRBD_FAULT_RS_RD = 3,
DRBD_FAULT_DT_WR = 4, /* data */
DRBD_FAULT_DT_RD = 5,
DRBD_FAULT_DT_RA = 6, /* data read ahead */
DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
DRBD_FAULT_AL_EE = 8, /* alloc ee */
DRBD_FAULT_MAX,
};
extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...);
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type);
static inline int
drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) {
return fault_rate &&
(enable_faults & (1<<type)) &&
_drbd_insert_fault(mdev, type);
}
#define FAULT_ACTIVE(_m, _t) (drbd_insert_fault((_m), (_t)))
#else
#define FAULT_ACTIVE(_m, _t) (0)
#endif
/* integer division, round _UP_ to the next integer */
#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
/* usual integer division */
#define div_floor(A, B) ((A)/(B))
/* drbd_meta-data.c (still in drbd_main.c) */
/* 4th incarnation of the disk layout. */
#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
extern struct drbd_conf **minor_table;
extern struct ratelimit_state drbd_ratelimit_state;
/* on the wire */
enum drbd_packets {
/* receiver (data socket) */
P_DATA = 0x00,
P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
P_BARRIER = 0x03,
P_BITMAP = 0x04,
P_BECOME_SYNC_TARGET = 0x05,
P_BECOME_SYNC_SOURCE = 0x06,
P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
P_SYNC_PARAM = 0x0a,
P_PROTOCOL = 0x0b,
P_UUIDS = 0x0c,
P_SIZES = 0x0d,
P_STATE = 0x0e,
P_SYNC_UUID = 0x0f,
P_AUTH_CHALLENGE = 0x10,
P_AUTH_RESPONSE = 0x11,
P_STATE_CHG_REQ = 0x12,
/* asender (meta socket */
P_PING = 0x13,
P_PING_ACK = 0x14,
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
P_BARRIER_ACK = 0x1c,
P_STATE_CHG_REPLY = 0x1d,
/* "new" commands, no longer fitting into the ordering scheme above */
P_OV_REQUEST = 0x1e, /* data socket */
P_OV_REPLY = 0x1f,
P_OV_RESULT = 0x20, /* meta socket */
P_CSUM_RS_REQUEST = 0x21, /* data socket */
P_RS_IS_IN_SYNC = 0x22, /* meta socket */
P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
P_MAX_CMD = 0x25,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
/* special command ids for handshake */
P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
};
static inline const char *cmdname(enum drbd_packets cmd)
{
/* THINK may need to become several global tables
* when we want to support more than
* one PRO_VERSION */
static const char *cmdnames[] = {
[P_DATA] = "Data",
[P_DATA_REPLY] = "DataReply",
[P_RS_DATA_REPLY] = "RSDataReply",
[P_BARRIER] = "Barrier",
[P_BITMAP] = "ReportBitMap",
[P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
[P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
[P_UNPLUG_REMOTE] = "UnplugRemote",
[P_DATA_REQUEST] = "DataRequest",
[P_RS_DATA_REQUEST] = "RSDataRequest",
[P_SYNC_PARAM] = "SyncParam",
[P_SYNC_PARAM89] = "SyncParam89",
[P_PROTOCOL] = "ReportProtocol",
[P_UUIDS] = "ReportUUIDs",
[P_SIZES] = "ReportSizes",
[P_STATE] = "ReportState",
[P_SYNC_UUID] = "ReportSyncUUID",
[P_AUTH_CHALLENGE] = "AuthChallenge",
[P_AUTH_RESPONSE] = "AuthResponse",
[P_PING] = "Ping",
[P_PING_ACK] = "PingAck",
[P_RECV_ACK] = "RecvAck",
[P_WRITE_ACK] = "WriteAck",
[P_RS_WRITE_ACK] = "RSWriteAck",
[P_DISCARD_ACK] = "DiscardAck",
[P_NEG_ACK] = "NegAck",
[P_NEG_DREPLY] = "NegDReply",
[P_NEG_RS_DREPLY] = "NegRSDReply",
[P_BARRIER_ACK] = "BarrierAck",
[P_STATE_CHG_REQ] = "StateChgRequest",
[P_STATE_CHG_REPLY] = "StateChgReply",
[P_OV_REQUEST] = "OVRequest",
[P_OV_REPLY] = "OVReply",
[P_OV_RESULT] = "OVResult",
[P_MAX_CMD] = NULL,
};
if (cmd == P_HAND_SHAKE_M)
return "HandShakeM";
if (cmd == P_HAND_SHAKE_S)
return "HandShakeS";
if (cmd == P_HAND_SHAKE)
return "HandShake";
if (cmd >= P_MAX_CMD)
return "Unknown";
return cmdnames[cmd];
}
/* for sending/receiving the bitmap,
* possibly in some encoding scheme */
struct bm_xfer_ctx {
/* "const"
* stores total bits and long words
* of the bitmap, so we don't need to
* call the accessor functions over and again. */
unsigned long bm_bits;
unsigned long bm_words;
/* during xfer, current position within the bitmap */
unsigned long bit_offset;
unsigned long word_offset;
/* statistics; index: (h->command == P_BITMAP) */
unsigned packets[2];
unsigned bytes[2];
};
extern void INFO_bm_xfer_stats(struct drbd_conf *mdev,
const char *direction, struct bm_xfer_ctx *c);
static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
{
/* word_offset counts "native long words" (32 or 64 bit),
* aligned at 64 bit.
* Encoded packet may end at an unaligned bit offset.
* In case a fallback clear text packet is transmitted in
* between, we adjust this offset back to the last 64bit
* aligned "native long word", which makes coding and decoding
* the plain text bitmap much more convenient. */
#if BITS_PER_LONG == 64
c->word_offset = c->bit_offset >> 6;
#elif BITS_PER_LONG == 32
c->word_offset = c->bit_offset >> 5;
c->word_offset &= ~(1UL);
#else
# error "unsupported BITS_PER_LONG"
#endif
}
#ifndef __packed
#define __packed __attribute__((packed))
#endif
/* This is the layout for a packet on the wire.
* The byteorder is the network byte order.
* (except block_id and barrier fields.
* these are pointers to local structs
* and have no relevance for the partner,
* which just echoes them as received.)
*
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
struct p_header {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
/* 8 bytes. packet FIXED for the next century! */
/*
* short commands, packets without payload, plain p_header:
* P_PING
* P_PING_ACK
* P_BECOME_SYNC_TARGET
* P_BECOME_SYNC_SOURCE
* P_UNPLUG_REMOTE
*/
/*
* commands with out-of-struct payload:
* P_BITMAP (no additional fields)
* P_DATA, P_DATA_REPLY (see p_data)
* P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
*/
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1
#define DP_RW_SYNC 2
#define DP_MAY_SET_IN_SYNC 4
struct p_data {
struct p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
u32 dp_flags;
} __packed;
/*
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
* P_DISCARD_ACK (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
struct p_header head;
u64 sector;
u64 block_id;
u32 blksize;
u32 seq_num;
} __packed;
struct p_block_req {
struct p_header head;
u64 sector;
u64 block_id;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
} __packed;
/*
* commands with their own struct for additional fields:
* P_HAND_SHAKE
* P_BARRIER
* P_BARRIER_ACK
* P_SYNC_PARAM
* ReportParams
*/
struct p_handshake {
struct p_header head; /* 8 bytes */
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* should be more than enough for future enhancements
* for now, feature_flags and the reserverd array shall be zero.
*/
u32 _pad;
u64 reserverd[7];
} __packed;
/* 80 bytes, FIXED for the next century */
struct p_barrier {
struct p_header head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
struct p_header head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
struct p_header head;
u32 rate;
/* Since protocol version 88 and higher. */
char verify_alg[0];
} __packed;
struct p_rs_param_89 {
struct p_header head;
u32 rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
struct p_protocol {
struct p_header head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
u32 after_sb_2p;
u32 want_lose;
u32 two_primaries;
/* Since protocol version 87 and higher. */
char integrity_alg[0];
} __packed;
struct p_uuids {
struct p_header head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
struct p_header head;
u64 uuid;
} __packed;
struct p_sizes {
struct p_header head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
u32 max_segment_size; /* Maximal size of a BIO */
u32 queue_order_type;
} __packed;
struct p_state {
struct p_header head;
u32 state;
} __packed;
struct p_req_state {
struct p_header head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
struct p_header head;
u32 retcode;
} __packed;
struct p_drbd06_param {
u64 size;
u32 state;
u32 blksize;
u32 protocol;
u32 version;
u32 gen_cnt[5];
u32 bit_map_gen[5];
} __packed;
struct p_discard {
struct p_header head;
u64 block_id;
u32 seq_num;
u32 pad;
} __packed;
/* Valid values for the encoding field.
* Bump proto version when changing this. */
enum drbd_bitmap_code {
/* RLE_VLI_Bytes = 0,
* and other bit variants had been defined during
* algorithm evaluation. */
RLE_VLI_Bits = 2,
};
struct p_compressed_bm {
struct p_header head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
* used to pad up to head.length bytes
*/
u8 encoding;
u8 code[0];
} __packed;
/* DCBP: Drbd Compressed Bitmap Packet ... */
static inline enum drbd_bitmap_code
DCBP_get_code(struct p_compressed_bm *p)
{
return (enum drbd_bitmap_code)(p->encoding & 0x0f);
}
static inline void
DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
BUG_ON(code & ~0xf);
p->encoding = (p->encoding & ~0xf) | code;
}
static inline int
DCBP_get_start(struct p_compressed_bm *p)
{
return (p->encoding & 0x80) != 0;
}
static inline void
DCBP_set_start(struct p_compressed_bm *p, int set)
{
p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
}
static inline int
DCBP_get_pad_bits(struct p_compressed_bm *p)
{
return (p->encoding >> 4) & 0x7;
}
static inline void
DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
{
BUG_ON(n & ~0x7);
p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
/* one bitmap packet, including the p_header,
* should fit within one _architecture independend_ page.
* so we need to use the fixed size 4KiB page size
* most architechtures have used for a long time.
*/
#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
/* drbd_send_bitmap / receive_bitmap would break horribly */
#error "PAGE_SIZE too small"
#endif
union p_polymorph {
struct p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
struct p_state state;
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
} __packed;
/**********************************************************************/
enum drbd_thread_state {
None,
Running,
Exiting,
Restarting
};
struct drbd_thread {
spinlock_t t_lock;
struct task_struct *task;
struct completion stop;
enum drbd_thread_state t_state;
int (*function) (struct drbd_thread *);
struct drbd_conf *mdev;
int reset_cpu_mask;
};
static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
{
/* THINK testing the t_state seems to be uncritical in all cases
* (but thread_{start,stop}), so we can read it *without* the lock.
* --lge */
smp_rmb();
return thi->t_state;
}
/*
* Having this as the first member of a struct provides sort of "inheritance".
* "derived" structs can be "drbd_queue_work()"ed.
* The callback should know and cast back to the descendant struct.
* drbd_request and drbd_epoch_entry are descendants of drbd_work.
*/
struct drbd_work;
typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
struct drbd_work {
struct list_head list;
drbd_work_cb cb;
};
struct drbd_tl_epoch;
struct drbd_request {
struct drbd_work w;
struct drbd_conf *mdev;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
* see drbd_endio_pri(). */
struct bio *private_bio;
struct hlist_node colision;
sector_t sector;
unsigned int size;
unsigned int epoch; /* barrier_nr */
/* barrier_nr: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
* starting a new epoch...
*/
/* up to here, the struct layout is identical to drbd_epoch_entry;
* we might be able to use that to our advantage... */
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
unsigned long rq_state; /* see comments above _req_mod() */
int seq_num;
unsigned long start_time;
};
struct drbd_tl_epoch {
struct drbd_work w;
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
int n_req; /* number of requests attached before this barrier */
};
struct drbd_request;
/* These Tl_epoch_entries may be in one of 6 lists:
active_ee .. data packet being written
sync_ee .. syncer block being written
done_ee .. block written, need to send P_WRITE_ACK
read_ee .. [RS]P_DATA_REQUEST being read
*/
struct drbd_epoch {
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
atomic_t active; /* increased on every req. added, and dec on every finished. */
unsigned long flags;
};
/* drbd_epoch flag bits */
enum {
DE_BARRIER_IN_NEXT_EPOCH_ISSUED,
DE_BARRIER_IN_NEXT_EPOCH_DONE,
DE_CONTAINS_A_BARRIER,
DE_HAVE_BARRIER_NUMBER,
DE_IS_FINISHING,
};
enum epoch_event {
EV_PUT,
EV_GOT_BARRIER_NR,
EV_BARRIER_DONE,
EV_BECAME_LAST,
EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */
EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */
EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */
EV_TRACE_ALLOC,
EV_TRACE_FREE,
EV_CLEANUP = 32, /* used as flag */
};
struct drbd_epoch_entry {
struct drbd_work w;
struct drbd_conf *mdev;
struct bio *private_bio;
struct hlist_node colision;
sector_t sector;
unsigned int size;
struct drbd_epoch *epoch;
/* up to here, the struct layout is identical to drbd_request;
* we might be able to use that to our advantage... */
unsigned int flags;
u64 block_id;
};
struct drbd_wq_barrier {
struct drbd_work w;
struct completion done;
};
struct digest_info {
int digest_size;
void *digest;
};
/* ee flag bits */
enum {
__EE_CALL_AL_COMPLETE_IO,
__EE_CONFLICT_PENDING,
__EE_MAY_SET_IN_SYNC,
__EE_IS_BARRIER,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
/* global flag bits */
enum {
CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
STOP_SYNC_TIMER, /* tell timer to cancel itself */
UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
CL_ST_CHG_SUCCESS,
CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
* Gets cleared when the state.conn
* goes into C_CONNECTED state. */
WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */
NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */
CONSIDER_RESYNC,
MD_NO_BARRIER, /* meta data device does not support barriers,
so don't even try */
SUSPEND_IO, /* suspend application io */
BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
CONFIG_PENDING, /* serialization of (re)configuration requests.
* if set, also prevents the device from dying */
DEVICE_DYING, /* device became unconfigured,
* but worker thread is still handling the cleanup.
* reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
* while this is set. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
};
struct drbd_bitmap; /* opaque for drbd_conf */
/* TODO sort members for performance
* MAYBE group them further */
/* THINK maybe we actually want to use the default "event/%s" worker threads
* or similar in linux 2.6, which uses per cpu data and threads.
*
* To be general, this might need a spin_lock member.
* For now, please use the mdev->req_lock to protect list_head,
* see drbd_queue_work below.
*/
struct drbd_work_queue {
struct list_head q;
struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
};
struct drbd_socket {
struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
* send/receive buffers off the stack */
union p_polymorph sbuf;
union p_polymorph rbuf;
};
struct drbd_md {
u64 md_offset; /* sector offset to 'super' block */
u64 la_size_sect; /* last agreed size, unit sectors */
u64 uuid[UI_SIZE];
u64 device_uuid;
u32 flags;
u32 md_size_sect;
s32 al_offset; /* signed relative sector offset to al area */
s32 bm_offset; /* signed relative sector offset to bitmap */
/* u32 al_nr_extents; important for restoring the AL
* is stored into sync_conf.al_extents, which in turn
* gets applied to act_log->nr_elements
*/
};
/* for sync_conf and other types... */
#define NL_PACKET(name, number, fields) struct name { fields };
#define NL_INTEGER(pn,pr,member) int member;
#define NL_INT64(pn,pr,member) __u64 member;
#define NL_BIT(pn,pr,member) unsigned member:1;
#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
#include "linux/drbd_nl.h"
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
struct file *lo_file;
struct file *md_file;
struct drbd_md md;
struct disk_conf dc; /* The user provided config... */
sector_t known_size; /* last known size of that backing device */
};
struct drbd_md_io {
struct drbd_conf *mdev;
struct completion event;
int error;
};
struct bm_io_work {
struct drbd_work w;
char *why;
int (*io_fn)(struct drbd_conf *mdev);
void (*done)(struct drbd_conf *mdev, int rv);
};
enum write_ordering_e {
WO_none,
WO_drain_io,
WO_bdev_flush,
WO_bio_barrier
};
struct drbd_conf {
/* things that are stored as / read from meta data on disk */
unsigned long flags;
/* configured by drbdsetup */
struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
struct syncer_conf sync_conf;
struct drbd_backing_dev *ldev __protected_by(local);
sector_t p_size; /* partner's disk size */
struct request_queue *rq_queue;
struct block_device *this_bdev;
struct gendisk *vdisk;
struct drbd_socket data; /* data/barrier/cstate/parameter packets */
struct drbd_socket meta; /* ping/ack (metadata) packets */
int agreed_pro_version; /* actually used protocol version */
unsigned long last_received; /* in jiffies, either socket */
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
md_sync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
union drbd_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
unsigned int writ_cnt;
unsigned int al_writ_cnt;
unsigned int bm_writ_cnt;
atomic_t ap_bio_cnt; /* Requests we need to complete */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
atomic_t unacked_cnt; /* Need to send replys for */
atomic_t local_cnt; /* Waiting for local completion */
atomic_t net_cnt; /* Users of net_conf */
spinlock_t req_lock;
struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
struct drbd_tl_epoch *newest_tle;
struct drbd_tl_epoch *oldest_tle;
struct list_head out_of_sequence_requests;
struct hlist_head *tl_hash;
unsigned int tl_hash_s;
/* blocks to sync in this run [unit BM_BLOCK_SIZE] */
unsigned long rs_total;
/* number of sync IOs that failed in this run */
unsigned long rs_failed;
/* Syncer's start time [unit jiffies] */
unsigned long rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsigned long rs_paused;
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
unsigned long rs_mark_left;
/* marks's time [unit jiffies] */
unsigned long rs_mark_time;
/* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
unsigned long rs_same_csum;
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
/* where are we now? (sector) */
sector_t ov_position;
/* Start sector of out of sync range (to merge printk reporting). */
sector_t ov_last_oos_start;
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsigned long ov_left; /* in bits */
struct crypto_hash *csums_tfm;
struct crypto_hash *verify_tfm;
struct drbd_thread receiver;
struct drbd_thread worker;
struct drbd_thread asender;
struct drbd_bitmap *bitmap;
unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
/* Used to track operations of resync... */
struct lru_cache *resync;
/* Number of locked elements in resync LRU */
unsigned int resync_locked;
/* resync extent number waiting for application requests */
unsigned int resync_wenr;
int open_cnt;
u64 *p_uuid;
struct drbd_epoch *current_epoch;
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
struct list_head active_ee; /* IO in progress */
struct list_head sync_ee; /* IO in progress */
struct list_head done_ee; /* send ack */
struct list_head read_ee; /* IO in progress */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
/* this one is protected by ee_lock, single thread */
struct drbd_epoch_entry *last_write_w_barrier;
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use;
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
struct mutex md_io_mutex; /* protects the md_io_buffer */
spinlock_t al_lock;
wait_queue_head_t al_wait;
struct lru_cache *act_log; /* activity log */
unsigned int al_tr_number;
int al_tr_cycle;
int al_tr_pos; /* position of the next transaction in the journal */
struct crypto_hash *cram_hmac_tfm;
struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
void *int_dig_out;
void *int_dig_in;
void *int_dig_vv;
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
cpumask_var_t cpu_mask;
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
struct drbd_conf *mdev;
mdev = minor < minor_count ? minor_table[minor] : NULL;
return mdev;
}
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
{
return mdev->minor;
}
/* returns 1 if it was successfull,
* returns 0 if there was no data socket.
* so wherever you are going to use the data.socket, e.g. do
* if (!drbd_get_data_sock(mdev))
* return 0;
* CODE();
* drbd_put_data_sock(mdev);
*/
static inline int drbd_get_data_sock(struct drbd_conf *mdev)
{
mutex_lock(&mdev->data.mutex);
/* drbd_disconnect() could have called drbd_free_sock()
* while we were waiting in down()... */
if (unlikely(mdev->data.socket == NULL)) {
mutex_unlock(&mdev->data.mutex);
return 0;
}
return 1;
}
static inline void drbd_put_data_sock(struct drbd_conf *mdev)
{
mutex_unlock(&mdev->data.mutex);
}
/*
* function declarations
*************************/
/* drbd_main.c */
enum chg_state_flags {
CS_HARD = 1,
CS_VERBOSE = 2,
CS_WAIT_COMPLETE = 4,
CS_SERIALIZE = 8,
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
};
extern void drbd_init_set_defaults(struct drbd_conf *mdev);
extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
union drbd_state mask, union drbd_state val);
extern void drbd_force_state(struct drbd_conf *, union drbd_state,
union drbd_state);
extern int _drbd_request_state(struct drbd_conf *, union drbd_state,
union drbd_state, enum chg_state_flags);
extern int __drbd_set_state(struct drbd_conf *, union drbd_state,
enum chg_state_flags, struct completion *done);
extern void print_st_err(struct drbd_conf *, union drbd_state,
union drbd_state, int);
extern int drbd_thread_start(struct drbd_thread *thi);
extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
#ifdef CONFIG_SMP
extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
#else
#define drbd_thread_current_set_cpu(A) ({})
#define drbd_calc_cpu_mask(A) ({})
#endif
extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
void *buf, size_t size, unsigned msg_flags);
extern int drbd_send_protocol(struct drbd_conf *mdev);
extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
enum drbd_packets cmd, struct p_header *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
enum drbd_packets cmd, struct p_header *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
u32 set_size);
extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
struct drbd_epoch_entry *e);
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_data *dp);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
struct drbd_epoch_entry *e);
extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
extern int _drbd_send_barrier(struct drbd_conf *mdev,
struct drbd_tl_epoch *barrier);
extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id);
extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
sector_t sector,int size,
void *digest, int digest_size,
enum drbd_packets cmd);
extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
extern int drbd_send_bitmap(struct drbd_conf *mdev);
extern int _drbd_send_bitmap(struct drbd_conf *mdev);
extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
/* drbd_meta-data.c (still in drbd_main.c) */
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
/* maybe define them below as inline? */
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local);
extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local);
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
void (*done)(struct drbd_conf *, int),
char *why);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
/* Meta data layout
We reserve a 128MB Block (4k aligned)
* either at the end of the backing device
* or on a seperate meta data device. */
#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
/* The following numbers are sectors */
#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
/* Allows up to about 3.8TB */
#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
/* Since the smalles IO unit is usually 512 byte */
#define MD_SECTOR_SHIFT 9
#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
/* activity log */
#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
#define lel_to_cpu(A) le32_to_cpu(A)
#elif BITS_PER_LONG == 64
#define LN2_BPL 6
#define cpu_to_lel(A) cpu_to_le64(A)
#define lel_to_cpu(A) le64_to_cpu(A)
#else
#error "LN2 of BITS_PER_LONG unknown!"
#endif
/* resync bitmap */
/* 16MB sized 'bitmap extent' to track syncer usage */
struct bm_extent {
int rs_left; /* number of bits set (out of sync) in this extent. */
int rs_failed; /* number of failed resync requests in this extent. */
unsigned long flags;
struct lc_element lce;
};
#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
/* drbd_bitmap.c */
/*
* We need to store one bit for a block.
* Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
* Bit 0 ==> local node thinks this block is binary identical on both nodes
* Bit 1 ==> local node thinks this block needs to be synced.
*/
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
* per sector of on disk bitmap */
#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
#error "HAVE YOU FIXED drbdmeta AS WELL??"
#endif
/* thus many _storage_ sectors are described by one bit */
#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
/* bit to represented kilo byte conversion */
#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
/* in which _bitmap_ extent (resp. sector) the bit for a certain
* _storage_ sector is located in */
#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
/* how much _storage_ sectors we have per bitmap sector */
#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
/* in one sector of the bitmap, we have this many activity_log extents. */
#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT)
#define BM_BLOCKS_PER_BM_EXT_MASK ((1<<BM_BLOCKS_PER_BM_EXT_B) - 1)
/* the extent in "PER_EXTENT" below is an activity log extent
* we need that many (long words/bytes) to store the bitmap
* of one AL_EXTENT_SIZE chunk of storage.
* we can store the bitmap for that many AL_EXTENTS within
* one sector of the _on_disk_ bitmap:
* bit 0 bit 37 bit 38 bit (512*8)-1
* ...|........|........|.. // ..|........|
* sect. 0 `296 `304 ^(512*8*8)-1
*
#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
*/
#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
#define DRBD_MAX_SECTORS_BM \
((MD_RESERVED_SECT - MD_BM_OFFSET) * (1LL<<(BM_EXT_SHIFT-9)))
#if DRBD_MAX_SECTORS_BM < DRBD_MAX_SECTORS_32
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_BM
#elif !defined(CONFIG_LBD) && BITS_PER_LONG == 32
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
#else
#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM
/* 16 TB in units of sectors */
#if BITS_PER_LONG == 32
/* adjust by one page worth of bitmap,
* so we won't wrap around in drbd_bm_find_next_bit.
* you should use 64bit OS for that much storage, anyways. */
#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
#else
#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0x1LU << 32)
#endif
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
* With a value of 6 all IO in one 32K block make it to the same slot of the
* hash table. */
#define HT_SHIFT 6
#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
extern void drbd_bm_cleanup(struct drbd_conf *mdev);
extern void drbd_bm_set_all(struct drbd_conf *mdev);
extern void drbd_bm_clear_all(struct drbd_conf *mdev);
extern int drbd_bm_set_bits(
struct drbd_conf *mdev, unsigned long s, unsigned long e);
extern int drbd_bm_clear_bits(
struct drbd_conf *mdev, unsigned long s, unsigned long e);
/* bm_set_bits variant for use while holding drbd_bm_lock */
extern void _drbd_bm_set_bits(struct drbd_conf *mdev,
const unsigned long s, const unsigned long e);
extern int drbd_bm_test_bit(struct drbd_conf *mdev, unsigned long bitnr);
extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local);
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
unsigned long al_enr);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
/* bm_find_next variants for use while you hold drbd_bm_lock() */
extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
extern int drbd_bm_rs_done(struct drbd_conf *mdev);
/* for receive_bitmap */
extern void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset,
size_t number, unsigned long *buffer);
/* for _drbd_send_bitmap and drbd_bm_write_sect */
extern void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset,
size_t number, unsigned long *buffer);
extern void drbd_bm_lock(struct drbd_conf *mdev, char *why);
extern void drbd_bm_unlock(struct drbd_conf *mdev);
extern int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e);
/* drbd_main.c */
extern struct kmem_cache *drbd_request_cache;
extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
extern mempool_t *drbd_ee_mempool;
extern struct page *drbd_pp_pool; /* drbd's page pool */
extern spinlock_t drbd_pp_lock;
extern int drbd_pp_vacant;
extern wait_queue_head_t drbd_pp_wait;
extern rwlock_t global_state_lock;
extern struct drbd_conf *drbd_new_device(unsigned int minor);
extern void drbd_free_mdev(struct drbd_conf *mdev);
extern int proc_details;
/* drbd_req */
extern int drbd_make_request_26(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
/* drbd_nl.c */
extern void drbd_suspend_io(struct drbd_conf *mdev);
extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
extern sector_t drbd_new_dev_size(struct drbd_conf *,
struct drbd_backing_dev *);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
int force);
enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
extern int drbd_worker(struct drbd_thread *thi);
extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
extern int drbd_resync_finished(struct drbd_conf *mdev);
/* maybe rather drbd_main.c ? */
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, sector_t sector, int rw);
extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
static inline void ov_oos_print(struct drbd_conf *mdev)
{
if (mdev->ov_last_oos_size) {
dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
(unsigned long long)mdev->ov_last_oos_start,
(unsigned long)mdev->ov_last_oos_size);
}
mdev->ov_last_oos_size=0;
}
extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
/* worker callbacks */
extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int);
extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
extern int w_io_error(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
u64 id,
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, int optlen)
{
int err;
if (level == SOL_SOCKET)
err = sock_setsockopt(sock, level, optname, optval, optlen);
else
err = sock->ops->setsockopt(sock, level, optname, optval,
optlen);
return err;
}
static inline void drbd_tcp_cork(struct socket *sock)
{
int __user val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
(char __user *)&val, sizeof(val));
}
static inline void drbd_tcp_uncork(struct socket *sock)
{
int __user val = 0;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
(char __user *)&val, sizeof(val));
}
static inline void drbd_tcp_nodelay(struct socket *sock)
{
int __user val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
(char __user *)&val, sizeof(val));
}
static inline void drbd_tcp_quickack(struct socket *sock)
{
int __user val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
(char __user *)&val, sizeof(val));
}
void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
extern struct file_operations drbd_proc_fops;
extern const char *drbd_conn_str(enum drbd_conns s);
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
extern void drbd_rs_cancel_all(struct drbd_conf *mdev);
extern int drbd_rs_del_all(struct drbd_conf *mdev);
extern void drbd_rs_failed_io(struct drbd_conf *mdev,
sector_t sector, int size);
extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
#define drbd_set_in_sync(mdev, sector, size) \
__drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__)
extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
#define drbd_set_out_of_sync(mdev, sector, size) \
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev);
extern void drbd_al_shrink(struct drbd_conf *mdev);
/* drbd_nl.c */
void drbd_nl_cleanup(void);
int __init drbd_nl_init(void);
void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
void drbd_bcast_sync_progress(struct drbd_conf *mdev);
void drbd_bcast_ee(struct drbd_conf *mdev,
const char *reason, const int dgs,
const char* seen_hash, const char* calc_hash,
const struct drbd_epoch_entry* e);
/**
* DOC: DRBD State macros
*
* These macros are used to express state changes in easily readable form.
*
* The NS macros expand to a mask and a value, that can be bit ored onto the
* current state as soon as the spinlock (req_lock) was taken.
*
* The _NS macros are used for state functions that get called with the
* spinlock. These macros expand directly to the new state value.
*
* Besides the basic forms NS() and _NS() additional _?NS[23] are defined
* to express state changes that affect more than one aspect of the state.
*
* E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
* Means that the network connection was established and that the peer
* is in secondary role.
*/
#define role_MASK R_MASK
#define peer_MASK R_MASK
#define disk_MASK D_MASK
#define pdsk_MASK D_MASK
#define conn_MASK C_MASK
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T = (S); val; })
#define NS2(T1, S1, T2, S2) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val; })
#define NS3(T1, S1, T2, S2, T3, S3) \
({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
({ union drbd_state val; val.i = 0; val.T1 = (S1); \
val.T2 = (S2); val.T3 = (S3); val; })
#define _NS(D, T, S) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
#define _NS2(D, T1, S1, T2, S2) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
__ns.T2 = (S2); __ns; })
#define _NS3(D, T1, S1, T2, S2, T3, S3) \
D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
/*
* inline helper functions
*************************/
static inline void drbd_state_lock(struct drbd_conf *mdev)
{
wait_event(mdev->misc_wait,
!test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags));
}
static inline void drbd_state_unlock(struct drbd_conf *mdev)
{
clear_bit(CLUSTER_ST_CHANGE, &mdev->flags);
wake_up(&mdev->misc_wait);
}
static inline int _drbd_set_state(struct drbd_conf *mdev,
union drbd_state ns, enum chg_state_flags flags,
struct completion *done)
{
int rv;
read_lock(&global_state_lock);
rv = __drbd_set_state(mdev, ns, flags, done);
read_unlock(&global_state_lock);
return rv;
}
/**
* drbd_request_state() - Reqest a state change
* @mdev: DRBD device.
* @mask: mask of state bits to change.
* @val: value of new state bits.
*
* This is the most graceful way of requesting a state change. It is verbose
* quite verbose in case the state change is not possible, and all those
* state changes are globally serialized.
*/
static inline int drbd_request_state(struct drbd_conf *mdev,
union drbd_state mask,
union drbd_state val)
{
return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
}
#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where)
{
switch (mdev->ldev->dc.on_io_error) {
case EP_PASS_ON:
if (!forcedetach) {
if (printk_ratelimit())
dev_err(DEV, "Local IO failed in %s."
"Passing error on...\n", where);
break;
}
/* NOTE fall through to detach case if forcedetach set */
case EP_DETACH:
case EP_CALL_HELPER:
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV, "Local IO failed in %s."
"Detaching...\n", where);
}
break;
}
}
/**
* drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
* @mdev: DRBD device.
* @error: Error code passed to the IO completion callback
* @forcedetach: Force detach. I.e. the error happened while accessing the meta data
*
* See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
*/
#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
static inline void drbd_chk_io_error_(struct drbd_conf *mdev,
int error, int forcedetach, const char *where)
{
if (error) {
unsigned long flags;
spin_lock_irqsave(&mdev->req_lock, flags);
__drbd_chk_io_error_(mdev, forcedetach, where);
spin_unlock_irqrestore(&mdev->req_lock, flags);
}
}
/**
* drbd_md_first_sector() - Returns the first sector number of the meta data area
* @bdev: Meta data block device.
*
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
case DRBD_MD_INDEX_FLEX_EXT:
default:
return bdev->md.md_offset;
}
}
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + MD_AL_OFFSET - 1;
case DRBD_MD_INDEX_FLEX_EXT:
default:
return bdev->md.md_offset + bdev->md.md_size_sect;
}
}
/* Returns the number of 512 byte sectors of the device */
static inline sector_t drbd_get_capacity(struct block_device *bdev)
{
/* return bdev ? get_capacity(bdev->bd_disk) : 0; */
return bdev ? bdev->bd_inode->i_size >> 9 : 0;
}
/**
* drbd_get_max_capacity() - Returns the capacity we announce to out peer
* @bdev: Meta data block device.
*
* returns the capacity we announce to out peer. we clip ourselves at the
* various MAX_SECTORS, because if we don't, current implementation will
* oops sooner or later
*/
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
switch (bdev->dc.meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
drbd_md_first_sector(bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
drbd_get_capacity(bdev->backing_bdev));
/* clip at maximum size the meta device can support */
s = min_t(sector_t, s,
BM_EXT_TO_SECT(bdev->md.md_size_sect
- bdev->md.bm_offset));
break;
default:
s = min_t(sector_t, DRBD_MAX_SECTORS,
drbd_get_capacity(bdev->backing_bdev));
}
return s;
}
/**
* drbd_md_ss__() - Return the sector number of our meta data super block
* @mdev: DRBD device.
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
switch (bdev->dc.meta_dev_idx) {
default: /* external, some index */
return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
case DRBD_MD_INDEX_INTERNAL:
/* with drbd08, internal meta data is always "flexible" */
case DRBD_MD_INDEX_FLEX_INT:
/* sizeof(struct md_on_disk_07) == 4k
* position: last 4k aligned block of 4k size */
if (!bdev->backing_bdev) {
if (__ratelimit(&drbd_ratelimit_state)) {
dev_err(DEV, "bdev->backing_bdev==NULL\n");
dump_stack();
}
return 0;
}
return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL)
- MD_AL_OFFSET;
case DRBD_MD_INDEX_FLEX_EXT:
return 0;
}
}
static inline void
_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
{
list_add_tail(&w->list, &q->q);
up(&q->s);
}
static inline void
drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
{
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
up(&q->s); /* within the spinlock,
see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
}
static inline void
drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
{
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
up(&q->s); /* within the spinlock,
see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
}
static inline void wake_asender(struct drbd_conf *mdev)
{
if (test_bit(SIGNAL_ASENDER, &mdev->flags))
force_sig(DRBD_SIG, mdev->asender.task);
}
static inline void request_ping(struct drbd_conf *mdev)
{
set_bit(SEND_PING, &mdev->flags);
wake_asender(mdev);
}
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
{
struct p_header h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
struct p_header h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
struct p_header h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
_drbd_thread_stop(thi, FALSE, TRUE);
}
static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
{
_drbd_thread_stop(thi, FALSE, FALSE);
}
static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
{
_drbd_thread_stop(thi, TRUE, FALSE);
}
/* counts how many answer packets packets we expect from our peer,
* for either explicit application requests,
* or implicit barrier packets as necessary.
* increased:
* w_send_barrier
* _req_mod(req, queue_for_net_write or queue_for_net_read);
* it is much easier and equally valid to count what we queue for the
* worker, even before it actually was queued or send.
* (drbd_make_request_common; recovery path on read io-error)
* decreased:
* got_BarrierAck (respective tl_clear, tl_clear_barrier)
* _req_mod(req, data_received)
* [from receive_DataReply]
* _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
* [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
* for some reason it is NOT decreased in got_NegAck,
* but in the resulting cleanup code from report_params.
* we should try to remember the reason for that...
* _req_mod(req, send_failed or send_canceled)
* _req_mod(req, connection_lost_while_pending)
* [from tl_clear_barrier]
*/
static inline void inc_ap_pending(struct drbd_conf *mdev)
{
atomic_inc(&mdev->ap_pending_cnt);
}
#define ERR_IF_CNT_IS_NEGATIVE(which) \
if (atomic_read(&mdev->which) < 0) \
dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
__func__ , __LINE__ , \
atomic_read(&mdev->which))
#define dec_ap_pending(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
wake_up(&mdev->misc_wait); \
ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
/* counts how many resync-related answers we still expect from the peer
* increase decrease
* C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
* C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER)
* (or P_NEG_ACK with ID_SYNCER)
*/
static inline void inc_rs_pending(struct drbd_conf *mdev)
{
atomic_inc(&mdev->rs_pending_cnt);
}
#define dec_rs_pending(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_dec(&mdev->rs_pending_cnt); \
ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
/* counts how many answers we still need to send to the peer.
* increased on
* receive_Data unless protocol A;
* we need to send a P_RECV_ACK (proto B)
* or P_WRITE_ACK (proto C)
* receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
* receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
* receive_Barrier_* we need to send a P_BARRIER_ACK
*/
static inline void inc_unacked(struct drbd_conf *mdev)
{
atomic_inc(&mdev->unacked_cnt);
}
#define dec_unacked(mdev) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_dec(&mdev->unacked_cnt); \
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
#define sub_unacked(mdev, n) do { \
typecheck(struct drbd_conf *, mdev); \
atomic_sub(n, &mdev->unacked_cnt); \
ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
static inline void put_net_conf(struct drbd_conf *mdev)
{
if (atomic_dec_and_test(&mdev->net_cnt))
wake_up(&mdev->misc_wait);
}
/**
* get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
* @mdev: DRBD device.
*
* You have to call put_net_conf() when finished working with mdev->net_conf.
*/
static inline int get_net_conf(struct drbd_conf *mdev)
{
int have_net_conf;
atomic_inc(&mdev->net_cnt);
have_net_conf = mdev->state.conn >= C_UNCONNECTED;
if (!have_net_conf)
put_net_conf(mdev);
return have_net_conf;
}
/**
* get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev
* @M: DRBD device.
*
* You have to call put_ldev() when finished working with mdev->ldev.
*/
#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT))
#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS))
static inline void put_ldev(struct drbd_conf *mdev)
{
__release(local);
if (atomic_dec_and_test(&mdev->local_cnt))
wake_up(&mdev->misc_wait);
D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
}
#ifndef __CHECKER__
static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
int io_allowed;
atomic_inc(&mdev->local_cnt);
io_allowed = (mdev->state.disk >= mins);
if (!io_allowed)
put_ldev(mdev);
return io_allowed;
}
#else
extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins);
#endif
/* you must have an "get_ldev" reference */
static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
unsigned long *bits_left, unsigned int *per_mil_done)
{
/*
* this is to break it at compile time when we change that
* (we may feel 4TB maximum storage per drbd is not enough)
*/
typecheck(unsigned long, mdev->rs_total);
/* note: both rs_total and rs_left are in bits, i.e. in
* units of BM_BLOCK_SIZE.
* for the percentage, we don't care. */
*bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
/* >> 10 to prevent overflow,
* +1 to prevent division by zero */
if (*bits_left > mdev->rs_total) {
/* doh. maybe a logic bug somewhere.
* may also be just a race condition
* between this and a disconnect during sync.
* for now, just prevent in-kernel buffer overflow.
*/
smp_rmb();
dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n",
drbd_conn_str(mdev->state.conn),
*bits_left, mdev->rs_total, mdev->rs_failed);
*per_mil_done = 0;
} else {
/* make sure the calculation happens in long context */
unsigned long tmp = 1000UL -
(*bits_left >> 10)*1000UL
/ ((mdev->rs_total >> 10) + 1UL);
*per_mil_done = tmp;
}
}
/* this throttles on-the-fly application requests
* according to max_buffers settings;
* maybe re-implement using semaphores? */
static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
{
int mxb = 1000000; /* arbitrary limit on open requests */
if (get_net_conf(mdev)) {
mxb = mdev->net_conf->max_buffers;
put_net_conf(mdev);
}
return mxb;
}
static inline int drbd_state_is_stable(union drbd_state s)
{
/* DO NOT add a default clause, we want the compiler to warn us
* for any newly introduced state we may have forgotten to add here */
switch ((enum drbd_conns)s.conn) {
/* new io only accepted when there is no connection, ... */
case C_STANDALONE:
case C_WF_CONNECTION:
/* ... or there is a well established connection. */
case C_CONNECTED:
case C_SYNC_SOURCE:
case C_SYNC_TARGET:
case C_VERIFY_S:
case C_VERIFY_T:
case C_PAUSED_SYNC_S:
case C_PAUSED_SYNC_T:
/* maybe stable, look at the disk state */
break;
/* no new io accepted during tansitional states
* like handshake or teardown */
case C_DISCONNECTING:
case C_UNCONNECTED:
case C_TIMEOUT:
case C_BROKEN_PIPE:
case C_NETWORK_FAILURE:
case C_PROTOCOL_ERROR:
case C_TEAR_DOWN:
case C_WF_REPORT_PARAMS:
case C_STARTING_SYNC_S:
case C_STARTING_SYNC_T:
case C_WF_BITMAP_S:
case C_WF_BITMAP_T:
case C_WF_SYNC_UUID:
case C_MASK:
/* not "stable" */
return 0;
}
switch ((enum drbd_disk_state)s.disk) {
case D_DISKLESS:
case D_INCONSISTENT:
case D_OUTDATED:
case D_CONSISTENT:
case D_UP_TO_DATE:
/* disk state is stable as well. */
break;
/* no new io accepted during tansitional states */
case D_ATTACHING:
case D_FAILED:
case D_NEGOTIATING:
case D_UNKNOWN:
case D_MASK:
/* not "stable" */
return 0;
}
return 1;
}
static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
if (mdev->state.susp)
return 0;
if (test_bit(SUSPEND_IO, &mdev->flags))
return 0;
/* to avoid potential deadlock or bitmap corruption,
* in various places, we only allow new application io
* to start during "stable" states. */
/* no new io accepted when attaching or detaching the disk */
if (!drbd_state_is_stable(mdev->state))
return 0;
/* since some older kernels don't have atomic_add_unless,
* and we are within the spinlock anyways, we have this workaround. */
if (atomic_read(&mdev->ap_bio_cnt) > mxb)
return 0;
if (test_bit(BITMAP_IO, &mdev->flags))
return 0;
return 1;
}
/* I'd like to use wait_event_lock_irq,
* but I'm not sure when it got introduced,
* and not sure when it has 3 or 4 arguments */
static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
{
/* compare with after_state_ch,
* os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
DEFINE_WAIT(wait);
/* we wait here
* as long as the device is suspended
* until the bitmap is no longer on the fly during connection
* handshake as long as we would exeed the max_buffer limit.
*
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
spin_lock_irq(&mdev->req_lock);
while (!__inc_ap_bio_cond(mdev)) {
prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
spin_unlock_irq(&mdev->req_lock);
schedule();
finish_wait(&mdev->misc_wait, &wait);
spin_lock_irq(&mdev->req_lock);
}
atomic_add(one_or_two, &mdev->ap_bio_cnt);
spin_unlock_irq(&mdev->req_lock);
}
static inline void dec_ap_bio(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt);
D_ASSERT(ap_bio >= 0);
/* this currently does wake_up for every dec_ap_bio!
* maybe rather introduce some type of hysteresis?
* e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
if (ap_bio < mxb)
wake_up(&mdev->misc_wait);
if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
}
}
static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
{
mdev->ed_uuid = val;
}
static inline int seq_cmp(u32 a, u32 b)
{
/* we assume wrap around at 32bit.
* for wrap around at 24bit (old atomic_t),
* we'd have to
* a <<= 8; b <<= 8;
*/
return (s32)(a) - (s32)(b);
}
#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
/* CAUTION: please no side effects in arguments! */
#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
{
unsigned int m;
spin_lock(&mdev->peer_seq_lock);
m = seq_max(mdev->peer_seq, new_seq);
mdev->peer_seq = m;
spin_unlock(&mdev->peer_seq_lock);
if (m == new_seq)
wake_up(&mdev->seq_wait);
}
static inline void drbd_update_congested(struct drbd_conf *mdev)
{
struct sock *sk = mdev->data.socket->sk;
if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
set_bit(NET_CONGESTED, &mdev->flags);
}
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
/* sorry, we currently have no working implementation
* of distributed TCQ stuff */
#ifndef QUEUE_ORDERED_NONE
#define QUEUE_ORDERED_NONE 0
#endif
return QUEUE_ORDERED_NONE;
}
static inline void drbd_blk_run_queue(struct request_queue *q)
{
if (q && q->unplug_fn)
q->unplug_fn(q);
}
static inline void drbd_kick_lo(struct drbd_conf *mdev)
{
if (get_ldev(mdev)) {
drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev));
put_ldev(mdev);
}
}
static inline void drbd_md_flush(struct drbd_conf *mdev)
{
int r;
if (test_bit(MD_NO_BARRIER, &mdev->flags))
return;
r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
if (r) {
set_bit(MD_NO_BARRIER, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
#endif
This source diff could not be displayed because it is too large. You can view the blob instead.
/*
drbd_nl.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/autoconf.h>
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/in.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/connector.h>
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
#include "drbd_tracing.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
/* see get_sb_bdev and bd_claim */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
/* Generate the tag_list to struct functions */
#define NL_PACKET(name, number, fields) \
static int name ## _from_tags(struct drbd_conf *mdev, \
unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
static int name ## _from_tags(struct drbd_conf *mdev, \
unsigned short *tags, struct name *arg) \
{ \
int tag; \
int dlen; \
\
while ((tag = get_unaligned(tags++)) != TT_END) { \
dlen = get_unaligned(tags++); \
switch (tag_number(tag)) { \
fields \
default: \
if (tag & T_MANDATORY) { \
dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
return 0; \
} \
} \
tags = (unsigned short *)((char *)tags + dlen); \
} \
return 1; \
}
#define NL_INTEGER(pn, pr, member) \
case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
arg->member = get_unaligned((int *)(tags)); \
break;
#define NL_INT64(pn, pr, member) \
case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
arg->member = get_unaligned((u64 *)(tags)); \
break;
#define NL_BIT(pn, pr, member) \
case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
arg->member = *(char *)(tags) ? 1 : 0; \
break;
#define NL_STRING(pn, pr, member, len) \
case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
if (dlen > len) { \
dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
#member, dlen, (unsigned int)len); \
return 0; \
} \
arg->member ## _len = dlen; \
memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
break;
#include "linux/drbd_nl.h"
/* Generate the struct to tag_list functions */
#define NL_PACKET(name, number, fields) \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
static unsigned short* \
name ## _to_tags(struct drbd_conf *mdev, \
struct name *arg, unsigned short *tags) \
{ \
fields \
return tags; \
}
#define NL_INTEGER(pn, pr, member) \
put_unaligned(pn | pr | TT_INTEGER, tags++); \
put_unaligned(sizeof(int), tags++); \
put_unaligned(arg->member, (int *)tags); \
tags = (unsigned short *)((char *)tags+sizeof(int));
#define NL_INT64(pn, pr, member) \
put_unaligned(pn | pr | TT_INT64, tags++); \
put_unaligned(sizeof(u64), tags++); \
put_unaligned(arg->member, (u64 *)tags); \
tags = (unsigned short *)((char *)tags+sizeof(u64));
#define NL_BIT(pn, pr, member) \
put_unaligned(pn | pr | TT_BIT, tags++); \
put_unaligned(sizeof(char), tags++); \
*(char *)tags = arg->member; \
tags = (unsigned short *)((char *)tags+sizeof(char));
#define NL_STRING(pn, pr, member, len) \
put_unaligned(pn | pr | TT_STRING, tags++); \
put_unaligned(arg->member ## _len, tags++); \
memcpy(tags, arg->member, arg->member ## _len); \
tags = (unsigned short *)((char *)tags + arg->member ## _len);
#include "linux/drbd_nl.h"
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
void drbd_nl_send_reply(struct cn_msg *, int);
int drbd_khelper(struct drbd_conf *mdev, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
NULL, /* Will be set to address family */
NULL, /* Will be set to address */
NULL };
char mb[12], af[20], ad[60], *afs;
char *argv[] = {usermode_helper, cmd, mb, NULL };
int ret;
snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
if (get_net_conf(mdev)) {
switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
case AF_INET6:
afs = "ipv6";
snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
&((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
break;
case AF_INET:
afs = "ipv4";
snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
&((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
break;
default:
afs = "ssocks";
snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
&((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
}
snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
envp[3]=af;
envp[4]=ad;
put_net_conf(mdev);
}
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
ret = call_usermodehelper(usermode_helper, argv, envp, 1);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
else
dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
return ret;
}
enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
{
char *ex_to_string;
int r;
enum drbd_disk_state nps;
enum drbd_fencing_p fp;
D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
if (get_ldev_if_state(mdev, D_CONSISTENT)) {
fp = mdev->ldev->dc.fencing;
put_ldev(mdev);
} else {
dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
return mdev->state.pdsk;
}
if (fp == FP_STONITH)
_drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
r = drbd_khelper(mdev, "fence-peer");
switch ((r>>8) & 0xff) {
case 3: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
nps = D_INCONSISTENT;
break;
case 4: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
nps = D_OUTDATED;
break;
case 5: /* peer was down */
if (mdev->state.disk == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
nps = D_OUTDATED;
} else {
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
nps = mdev->state.pdsk;
}
break;
case 6: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
dev_warn(DEV, "Peer is primary, outdating myself.\n");
nps = D_UNKNOWN;
_drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
break;
case 7:
if (fp != FP_STONITH)
dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
nps = D_OUTDATED;
break;
default:
/* The script is broken ... */
nps = D_UNKNOWN;
dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
return nps;
}
dev_info(DEV, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
return nps;
}
int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
const int max_tries = 4;
int r = 0;
int try = 0;
int forced = 0;
union drbd_state mask, val;
enum drbd_disk_state nps;
if (new_role == R_PRIMARY)
request_ping(mdev); /* Detect a dead peer ASAP */
mutex_lock(&mdev->state_mutex);
mask.i = 0; mask.role = R_MASK;
val.i = 0; val.role = new_role;
while (try++ < max_tries) {
r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE);
/* in case we first succeeded to outdate,
* but now suddenly could establish a connection */
if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
val.pdsk = 0;
mask.pdsk = 0;
continue;
}
if (r == SS_NO_UP_TO_DATE_DISK && force &&
(mdev->state.disk == D_INCONSISTENT ||
mdev->state.disk == D_OUTDATED)) {
mask.disk = D_MASK;
val.disk = D_UP_TO_DATE;
forced = 1;
continue;
}
if (r == SS_NO_UP_TO_DATE_DISK &&
mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
nps = drbd_try_outdate_peer(mdev);
if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
val.disk = D_UP_TO_DATE;
mask.disk = D_MASK;
}
val.pdsk = nps;
mask.pdsk = D_MASK;
continue;
}
if (r == SS_NOTHING_TO_DO)
goto fail;
if (r == SS_PRIMARY_NOP && mask.pdsk == 0) {
nps = drbd_try_outdate_peer(mdev);
if (force && nps > D_OUTDATED) {
dev_warn(DEV, "Forced into split brain situation!\n");
nps = D_OUTDATED;
}
mask.pdsk = D_MASK;
val.pdsk = nps;
continue;
}
if (r == SS_TWO_PRIMARIES) {
/* Maybe the peer is detected as dead very soon...
retry at most once more in this case. */
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10);
if (try < max_tries)
try = max_tries - 1;
continue;
}
if (r < SS_SUCCESS) {
r = _drbd_request_state(mdev, mask, val,
CS_VERBOSE + CS_WAIT_COMPLETE);
if (r < SS_SUCCESS)
goto fail;
}
break;
}
if (r < SS_SUCCESS)
goto fail;
if (forced)
dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
/* Wait until nothing is on the fly :) */
wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
if (new_role == R_SECONDARY) {
set_disk_ro(mdev->vdisk, TRUE);
if (get_ldev(mdev)) {
mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
put_ldev(mdev);
}
} else {
if (get_net_conf(mdev)) {
mdev->net_conf->want_lose = 0;
put_net_conf(mdev);
}
set_disk_ro(mdev->vdisk, FALSE);
if (get_ldev(mdev)) {
if (((mdev->state.conn < C_CONNECTED ||
mdev->state.pdsk <= D_FAILED)
&& mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced)
drbd_uuid_new_current(mdev);
mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
put_ldev(mdev);
}
}
if ((new_role == R_SECONDARY) && get_ldev(mdev)) {
drbd_al_to_on_disk_bm(mdev);
put_ldev(mdev);
}
if (mdev->state.conn >= C_WF_REPORT_PARAMS) {
/* if this was forced, we should consider sync */
if (forced)
drbd_send_uuids(mdev);
drbd_send_state(mdev);
}
drbd_md_sync(mdev);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
mutex_unlock(&mdev->state_mutex);
return r;
}
static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
struct primary primary_args;
memset(&primary_args, 0, sizeof(struct primary));
if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
reply->ret_code = ERR_MANDATORY_TAG;
return 0;
}
reply->ret_code =
drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer);
return 0;
}
static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
return 0;
}
/* initializes the md.*_offset members, so we are able to find
* the on disk meta data */
static void drbd_md_set_sector_offsets(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
switch (bdev->dc.meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
bdev->md.md_size_sect = MD_RESERVED_SECT;
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
bdev->md.al_offset = MD_AL_OFFSET;
bdev->md.bm_offset = MD_BM_OFFSET;
break;
case DRBD_MD_INDEX_FLEX_EXT:
/* just occupy the full device; unit: sectors */
bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
bdev->md.md_offset = 0;
bdev->md.al_offset = MD_AL_OFFSET;
bdev->md.bm_offset = MD_BM_OFFSET;
break;
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
bdev->md.al_offset = -MD_AL_MAX_SIZE;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
md_size_sect = BM_SECT_TO_EXT(md_size_sect);
md_size_sect = ALIGN(md_size_sect, 8);
/* plus the "drbd meta data super block",
* and the activity log; */
md_size_sect += MD_BM_OFFSET;
bdev->md.md_size_sect = md_size_sect;
/* bitmap offset is adjusted by 'super' block size */
bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
break;
}
}
char *ppsize(char *buf, unsigned long long size)
{
/* Needs 9 bytes at max. */
static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
int base = 0;
while (size >= 10000) {
/* shift + round */
size = (size >> 10) + !!(size & (1<<9));
base++;
}
sprintf(buf, "%lu %cB", (long)size, units[base]);
return buf;
}
/* there is still a theoretical deadlock when called from receiver
* on an D_INCONSISTENT R_PRIMARY:
* remote READ does inc_ap_bio, receiver would need to receive answer
* packet from remote to dec_ap_bio again.
* receiver receive_sizes(), comes here,
* waits for ap_bio_cnt == 0. -> deadlock.
* but this cannot happen, actually, because:
* R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
* (not connected, or bad/no disk on peer):
* see drbd_fail_request_early, ap_bio_cnt is zero.
* R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
* peer may not initiate a resize.
*/
void drbd_suspend_io(struct drbd_conf *mdev)
{
set_bit(SUSPEND_IO, &mdev->flags);
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
void drbd_resume_io(struct drbd_conf *mdev)
{
clear_bit(SUSPEND_IO, &mdev->flags);
wake_up(&mdev->misc_wait);
}
/**
* drbd_determine_dev_size() - Sets the right device size obeying all constraints
* @mdev: DRBD device.
*
* Returns 0 on success, negative return values indicate errors.
* You should call drbd_md_sync() after calling this function.
*/
enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
sector_t la_size;
sector_t size;
char ppb[10];
int md_moved, la_size_changed;
enum determine_dev_size rv = unchanged;
/* race:
* application request passes inc_ap_bio,
* but then cannot get an AL-reference.
* this function later may wait on ap_bio_cnt == 0. -> deadlock.
*
* to avoid that:
* Suspend IO right here.
* still lock the act_log to not trigger ASSERTs there.
*/
drbd_suspend_io(mdev);
/* no wait necessary anymore, actually we could assert that */
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
prev_first_sect = drbd_md_first_sector(mdev->ldev);
prev_size = mdev->ldev->md.md_size_sect;
la_size = mdev->ldev->md.la_size_sect;
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
size = drbd_new_dev_size(mdev, mdev->ldev);
if (drbd_get_capacity(mdev->this_bdev) != size ||
drbd_bm_capacity(mdev) != size) {
int err;
err = drbd_bm_resize(mdev, size);
if (unlikely(err)) {
/* currently there is only one error: ENOMEM! */
size = drbd_bm_capacity(mdev)>>1;
if (size == 0) {
dev_err(DEV, "OUT OF MEMORY! "
"Could not allocate bitmap!\n");
} else {
dev_err(DEV, "BM resizing failed. "
"Leaving size unchanged at size = %lu KB\n",
(unsigned long)size);
}
rv = dev_size_error;
}
/* racy, see comments above. */
drbd_set_my_capacity(mdev, size);
mdev->ldev->md.la_size_sect = size;
dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
(unsigned long long)size>>1);
}
if (rv == dev_size_error)
goto out;
la_size_changed = (la_size != mdev->ldev->md.la_size_sect);
md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
|| prev_size != mdev->ldev->md.md_size_sect;
if (la_size_changed || md_moved) {
drbd_al_shrink(mdev); /* All extents inactive. */
dev_info(DEV, "Writing the whole bitmap, %s\n",
la_size_changed && md_moved ? "size changed and md moved" :
la_size_changed ? "size changed" : "md moved");
rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */
drbd_md_mark_dirty(mdev);
}
if (size > la_size)
rv = grew;
if (size < la_size)
rv = shrunk;
out:
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
drbd_resume_io(mdev);
return rv;
}
sector_t
drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
sector_t p_size = mdev->p_size; /* partner's disk size. */
sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
sector_t size = 0;
m_size = drbd_get_max_capacity(bdev);
if (p_size && m_size) {
size = min_t(sector_t, p_size, m_size);
} else {
if (la_size) {
size = la_size;
if (m_size && m_size < size)
size = m_size;
if (p_size && p_size < size)
size = p_size;
} else {
if (m_size)
size = m_size;
if (p_size)
size = p_size;
}
}
if (size == 0)
dev_err(DEV, "Both nodes diskless!\n");
if (u_size) {
if (u_size > size)
dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n",
(unsigned long)u_size>>1, (unsigned long)size>>1);
else
size = u_size;
}
return size;
}
/**
* drbd_check_al_size() - Ensures that the AL is of the right size
* @mdev: DRBD device.
*
* Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
* failed, and 0 on success. You should call drbd_md_sync() after you called
* this function.
*/
static int drbd_check_al_size(struct drbd_conf *mdev)
{
struct lru_cache *n, *t;
struct lc_element *e;
unsigned int in_use;
int i;
ERR_IF(mdev->sync_conf.al_extents < 7)
mdev->sync_conf.al_extents = 127;
if (mdev->act_log &&
mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
return 0;
in_use = 0;
t = mdev->act_log;
n = lc_create("act_log", drbd_al_ext_cache,
mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
if (n == NULL) {
dev_err(DEV, "Cannot allocate act_log lru!\n");
return -ENOMEM;
}
spin_lock_irq(&mdev->al_lock);
if (t) {
for (i = 0; i < t->nr_elements; i++) {
e = lc_element_by_index(t, i);
if (e->refcnt)
dev_err(DEV, "refcnt(%d)==%d\n",
e->lc_number, e->refcnt);
in_use += e->refcnt;
}
}
if (!in_use)
mdev->act_log = n;
spin_unlock_irq(&mdev->al_lock);
if (in_use) {
dev_err(DEV, "Activity log still in use!\n");
lc_destroy(n);
return -EBUSY;
} else {
if (t)
lc_destroy(t);
}
drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */
return 0;
}
void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local)
{
struct request_queue * const q = mdev->rq_queue;
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
int max_segments = mdev->ldev->dc.max_bio_bvecs;
if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
max_seg_s = PAGE_SIZE;
max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
blk_queue_max_sectors(q, max_seg_s >> 9);
blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS);
blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS);
blk_queue_max_segment_size(q, max_seg_s);
blk_queue_logical_block_size(q, 512);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
blk_stack_limits(&q->limits, &b->limits, 0);
if (b->merge_bvec_fn)
dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
b->merge_bvec_fn);
dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
q->backing_dev_info.ra_pages,
b->backing_dev_info.ra_pages);
q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
}
}
/* serialize deconfig (worker exiting, doing cleanup)
* and reconfig (drbdsetup disk, drbdsetup net)
*
* wait for a potentially exiting worker, then restart it,
* or start a new one.
*/
static void drbd_reconfig_start(struct drbd_conf *mdev)
{
wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags));
wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
drbd_thread_start(&mdev->worker);
}
/* if still unconfigured, stops worker again.
* if configured now, clears CONFIG_PENDING.
* wakes potential waiters */
static void drbd_reconfig_done(struct drbd_conf *mdev)
{
spin_lock_irq(&mdev->req_lock);
if (mdev->state.disk == D_DISKLESS &&
mdev->state.conn == C_STANDALONE &&
mdev->state.role == R_SECONDARY) {
set_bit(DEVICE_DYING, &mdev->flags);
drbd_thread_stop_nowait(&mdev->worker);
} else
clear_bit(CONFIG_PENDING, &mdev->flags);
spin_unlock_irq(&mdev->req_lock);
wake_up(&mdev->state_wait);
}
/* does always return 0;
* interesting return code is in reply->ret_code */
static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
enum drbd_ret_codes retcode;
enum determine_dev_size dd;
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
struct inode *inode, *inode2;
struct lru_cache *resync_lru = NULL;
union drbd_state ns, os;
int rv;
int cp_discovered = 0;
int logical_block_size;
drbd_reconfig_start(mdev);
/* if you want to reconfigure, please tear down first */
if (mdev->state.disk > D_DISKLESS) {
retcode = ERR_DISK_CONFIGURED;
goto fail;
}
/* allocation not in the IO path, cqueue thread context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
if (!nbc) {
retcode = ERR_NOMEM;
goto fail;
}
nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
nbc->dc.fencing = DRBD_FENCING_DEF;
nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
retcode = ERR_MANDATORY_TAG;
goto fail;
}
if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
if (IS_ERR(nbc->lo_file)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
PTR_ERR(nbc->lo_file));
nbc->lo_file = NULL;
retcode = ERR_OPEN_DISK;
goto fail;
}
inode = nbc->lo_file->f_dentry->d_inode;
if (!S_ISBLK(inode->i_mode)) {
retcode = ERR_DISK_NOT_BDEV;
goto fail;
}
nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0);
if (IS_ERR(nbc->md_file)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
PTR_ERR(nbc->md_file));
nbc->md_file = NULL;
retcode = ERR_OPEN_MD_DISK;
goto fail;
}
inode2 = nbc->md_file->f_dentry->d_inode;
if (!S_ISBLK(inode2->i_mode)) {
retcode = ERR_MD_NOT_BDEV;
goto fail;
}
nbc->backing_bdev = inode->i_bdev;
if (bd_claim(nbc->backing_bdev, mdev)) {
printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n",
nbc->backing_bdev, mdev,
nbc->backing_bdev->bd_holder,
nbc->backing_bdev->bd_contains->bd_holder,
nbc->backing_bdev->bd_holders);
retcode = ERR_BDCLAIM_DISK;
goto fail;
}
resync_lru = lc_create("resync", drbd_bm_ext_cache,
61, sizeof(struct bm_extent),
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
goto release_bdev_fail;
}
/* meta_dev_idx >= 0: external fixed size,
* possibly multiple drbd sharing one meta device.
* TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is
* not yet used by some other drbd minor!
* (if you use drbd.conf + drbdadm,
* that should check it for you already; but if you don't, or someone
* fooled it, we need to double check here) */
nbc->md_bdev = inode2->i_bdev;
if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev
: (void *) drbd_m_holder)) {
retcode = ERR_BDCLAIM_MD_DISK;
goto release_bdev_fail;
}
if ((nbc->backing_bdev == nbc->md_bdev) !=
(nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
retcode = ERR_MD_IDX_INVALID;
goto release_bdev2_fail;
}
/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
drbd_md_set_sector_offsets(mdev, nbc);
if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
(unsigned long long) drbd_get_max_capacity(nbc),
(unsigned long long) nbc->dc.disk_size);
retcode = ERR_DISK_TO_SMALL;
goto release_bdev2_fail;
}
if (nbc->dc.meta_dev_idx < 0) {
max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
/* at least one MB, otherwise it does not make sense */
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors)
dev_warn(DEV, "truncating very big lower level device "
"to currently maximum possible %llu sectors\n",
(unsigned long long) max_possible_sectors);
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
retcode = ERR_MD_DISK_TO_SMALL;
dev_warn(DEV, "refusing attach: md-device too small, "
"at least %llu sectors needed for this meta-disk type\n",
(unsigned long long) min_md_device_sectors);
goto release_bdev2_fail;
}
/* Make sure the new disk is big enough
* (we may currently be R_PRIMARY with no local disk...) */
if (drbd_get_max_capacity(nbc) <
drbd_get_capacity(mdev->this_bdev)) {
retcode = ERR_DISK_TO_SMALL;
goto release_bdev2_fail;
}
nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE);
drbd_resume_io(mdev);
if (retcode < SS_SUCCESS)
goto release_bdev2_fail;
if (!get_ldev_if_state(mdev, D_ATTACHING))
goto force_diskless;
drbd_md_set_sector_offsets(mdev, nbc);
if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
}
retcode = drbd_md_read(mdev, nbc);
if (retcode != NO_ERROR)
goto force_diskless_dec;
if (mdev->state.conn < C_CONNECTED &&
mdev->state.role == R_PRIMARY &&
(mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
dev_err(DEV, "Can only attach to data with current UUID=%016llX\n",
(unsigned long long)mdev->ed_uuid);
retcode = ERR_DATA_NOT_CURRENT;
goto force_diskless_dec;
}
/* Since we are diskless, fix the activity log first... */
if (drbd_check_al_size(mdev)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
/* Prevent shrinking of consistent devices ! */
if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) {
dev_warn(DEV, "refusing to truncate a consistent device\n");
retcode = ERR_DISK_TO_SMALL;
goto force_diskless_dec;
}
if (!drbd_al_read_log(mdev, nbc)) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
/* allocate a second IO page if logical_block_size != 512 */
logical_block_size = bdev_logical_block_size(nbc->md_bdev);
if (logical_block_size == 0)
logical_block_size = MD_SECTOR_SIZE;
if (logical_block_size != MD_SECTOR_SIZE) {
if (!mdev->md_io_tmpp) {
struct page *page = alloc_page(GFP_NOIO);
if (!page)
goto force_diskless_dec;
dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
logical_block_size, MD_SECTOR_SIZE);
dev_warn(DEV, "Workaround engaged (has performance impact).\n");
mdev->md_io_tmpp = page;
}
}
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */
if (nbc->dc.no_md_flush)
set_bit(MD_NO_BARRIER, &mdev->flags);
else
clear_bit(MD_NO_BARRIER, &mdev->flags);
/* Point of no return reached.
* Devices and memory are no longer released by error cleanup below.
* now mdev takes over responsibility, and the state engine should
* clean it up somewhere. */
D_ASSERT(mdev->ldev == NULL);
mdev->ldev = nbc;
mdev->resync = resync_lru;
nbc = NULL;
resync_lru = NULL;
mdev->write_ordering = WO_bio_barrier;
drbd_bump_write_ordering(mdev, WO_bio_barrier);
if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
set_bit(CRASHED_PRIMARY, &mdev->flags);
else
clear_bit(CRASHED_PRIMARY, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
set_bit(CRASHED_PRIMARY, &mdev->flags);
cp_discovered = 1;
}
mdev->send_cnt = 0;
mdev->recv_cnt = 0;
mdev->read_cnt = 0;
mdev->writ_cnt = 0;
drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
* I just now recover from a hard crash,
* and have been R_PRIMARY before that crash.
*
* Now, if I had no connection before that crash
* (have been degraded R_PRIMARY), chances are that
* I won't find my peer now either.
*
* In that case, and _only_ in that case,
* we use the degr-wfc-timeout instead of the default,
* so we can automatically recover from a crash of a
* degraded but active "cluster" after a certain timeout.
*/
clear_bit(USE_DEGR_WFC_T, &mdev->flags);
if (mdev->state.role != R_PRIMARY &&
drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
set_bit(USE_DEGR_WFC_T, &mdev->flags);
dd = drbd_determin_dev_size(mdev);
if (dd == dev_size_error) {
retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec;
} else if (dd == grew)
set_bit(RESYNC_AFTER_NEG, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
dev_info(DEV, "Assuming that all blocks are out of sync "
"(aka FullSync)\n");
if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
} else {
if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
}
if (cp_discovered) {
drbd_al_apply_to_bm(mdev);
drbd_al_to_on_disk_bm(mdev);
}
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
ns.i = os.i;
/* If MDF_CONSISTENT is not set go into inconsistent state,
otherwise investigate MDF_WasUpToDate...
If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
otherwise into D_CONSISTENT state.
*/
if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) {
if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE))
ns.disk = D_CONSISTENT;
else
ns.disk = D_OUTDATED;
} else {
ns.disk = D_INCONSISTENT;
}
if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
ns.pdsk = D_OUTDATED;
if ( ns.disk == D_CONSISTENT &&
(ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
ns.disk = D_UP_TO_DATE;
/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
this point, because drbd_request_state() modifies these
flags. */
/* In case we are C_CONNECTED postpone any decision on the new disk
state after the negotiation phase. */
if (mdev->state.conn == C_CONNECTED) {
mdev->new_state_tmp.i = ns.i;
ns.i = os.i;
ns.disk = D_NEGOTIATING;
}
rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
if (rv < SS_SUCCESS)
goto force_diskless_dec;
if (mdev->state.role == R_PRIMARY)
mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
else
mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
drbd_md_mark_dirty(mdev);
drbd_md_sync(mdev);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
put_ldev(mdev);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
return 0;
force_diskless_dec:
put_ldev(mdev);
force_diskless:
drbd_force_state(mdev, NS(disk, D_DISKLESS));
drbd_md_sync(mdev);
release_bdev2_fail:
if (nbc)
bd_release(nbc->md_bdev);
release_bdev_fail:
if (nbc)
bd_release(nbc->backing_bdev);
fail:
if (nbc) {
if (nbc->lo_file)
fput(nbc->lo_file);
if (nbc->md_file)
fput(nbc->md_file);
kfree(nbc);
}
lc_destroy(resync_lru);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
return 0;
}
static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS));
return 0;
}
static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int i, ns;
enum drbd_ret_codes retcode;
struct net_conf *new_conf = NULL;
struct crypto_hash *tfm = NULL;
struct crypto_hash *integrity_w_tfm = NULL;
struct crypto_hash *integrity_r_tfm = NULL;
struct hlist_head *new_tl_hash = NULL;
struct hlist_head *new_ee_hash = NULL;
struct drbd_conf *odev;
char hmac_name[CRYPTO_MAX_ALG_NAME];
void *int_dig_out = NULL;
void *int_dig_in = NULL;
void *int_dig_vv = NULL;
struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
drbd_reconfig_start(mdev);
if (mdev->state.conn > C_STANDALONE) {
retcode = ERR_NET_CONFIGURED;
goto fail;
}
/* allocation not in the IO path, cqueue thread context */
new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
if (!new_conf) {
retcode = ERR_NOMEM;
goto fail;
}
memset(new_conf, 0, sizeof(struct net_conf));
new_conf->timeout = DRBD_TIMEOUT_DEF;
new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
new_conf->ping_int = DRBD_PING_INT_DEF;
new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
new_conf->ko_count = DRBD_KO_COUNT_DEF;
new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
new_conf->want_lose = 0;
new_conf->two_primaries = 0;
new_conf->wire_protocol = DRBD_PROT_C;
new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
retcode = ERR_MANDATORY_TAG;
goto fail;
}
if (new_conf->two_primaries
&& (new_conf->wire_protocol != DRBD_PROT_C)) {
retcode = ERR_NOT_PROTO_C;
goto fail;
};
if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
retcode = ERR_DISCARD;
goto fail;
}
retcode = NO_ERROR;
new_my_addr = (struct sockaddr *)&new_conf->my_addr;
new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
for (i = 0; i < minor_count; i++) {
odev = minor_to_mdev(i);
if (!odev || odev == mdev)
continue;
if (get_net_conf(odev)) {
taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
!memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
retcode = ERR_LOCAL_ADDR;
taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
!memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
retcode = ERR_PEER_ADDR;
put_net_conf(odev);
if (retcode != NO_ERROR)
goto fail;
}
}
if (new_conf->cram_hmac_alg[0] != 0) {
snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
new_conf->cram_hmac_alg);
tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm)) {
tfm = NULL;
retcode = ERR_AUTH_ALG;
goto fail;
}
if (crypto_tfm_alg_type(crypto_hash_tfm(tfm))
!= CRYPTO_ALG_TYPE_HASH) {
retcode = ERR_AUTH_ALG_ND;
goto fail;
}
}
if (new_conf->integrity_alg[0]) {
integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(integrity_w_tfm)) {
integrity_w_tfm = NULL;
retcode=ERR_INTEGRITY_ALG;
goto fail;
}
if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
retcode=ERR_INTEGRITY_ALG_ND;
goto fail;
}
integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(integrity_r_tfm)) {
integrity_r_tfm = NULL;
retcode=ERR_INTEGRITY_ALG;
goto fail;
}
}
ns = new_conf->max_epoch_size/8;
if (mdev->tl_hash_s != ns) {
new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
if (!new_tl_hash) {
retcode = ERR_NOMEM;
goto fail;
}
}
ns = new_conf->max_buffers/8;
if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
if (!new_ee_hash) {
retcode = ERR_NOMEM;
goto fail;
}
}
((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
if (integrity_w_tfm) {
i = crypto_hash_digestsize(integrity_w_tfm);
int_dig_out = kmalloc(i, GFP_KERNEL);
if (!int_dig_out) {
retcode = ERR_NOMEM;
goto fail;
}
int_dig_in = kmalloc(i, GFP_KERNEL);
if (!int_dig_in) {
retcode = ERR_NOMEM;
goto fail;
}
int_dig_vv = kmalloc(i, GFP_KERNEL);
if (!int_dig_vv) {
retcode = ERR_NOMEM;
goto fail;
}
}
if (!mdev->bitmap) {
if(drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
goto fail;
}
}
spin_lock_irq(&mdev->req_lock);
if (mdev->net_conf != NULL) {
retcode = ERR_NET_CONFIGURED;
spin_unlock_irq(&mdev->req_lock);
goto fail;
}
mdev->net_conf = new_conf;
mdev->send_cnt = 0;
mdev->recv_cnt = 0;
if (new_tl_hash) {
kfree(mdev->tl_hash);
mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
mdev->tl_hash = new_tl_hash;
}
if (new_ee_hash) {
kfree(mdev->ee_hash);
mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
mdev->ee_hash = new_ee_hash;
}
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = tfm;
crypto_free_hash(mdev->integrity_w_tfm);
mdev->integrity_w_tfm = integrity_w_tfm;
crypto_free_hash(mdev->integrity_r_tfm);
mdev->integrity_r_tfm = integrity_r_tfm;
kfree(mdev->int_dig_out);
kfree(mdev->int_dig_in);
kfree(mdev->int_dig_vv);
mdev->int_dig_out=int_dig_out;
mdev->int_dig_in=int_dig_in;
mdev->int_dig_vv=int_dig_vv;
spin_unlock_irq(&mdev->req_lock);
retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
return 0;
fail:
kfree(int_dig_out);
kfree(int_dig_in);
kfree(int_dig_vv);
crypto_free_hash(tfm);
crypto_free_hash(integrity_w_tfm);
crypto_free_hash(integrity_r_tfm);
kfree(new_tl_hash);
kfree(new_ee_hash);
kfree(new_conf);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
return 0;
}
static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode;
retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
if (retcode == SS_NOTHING_TO_DO)
goto done;
else if (retcode == SS_ALREADY_STANDALONE)
goto done;
else if (retcode == SS_PRIMARY_NOP) {
/* Our statche checking code wants to see the peer outdated. */
retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
pdsk, D_OUTDATED));
} else if (retcode == SS_CW_FAILED_BY_PEER) {
/* The peer probably wants to see us outdated. */
retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
disk, D_OUTDATED),
CS_ORDERED);
if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
retcode = SS_SUCCESS;
}
}
if (retcode < SS_SUCCESS)
goto fail;
if (wait_event_interruptible(mdev->state_wait,
mdev->state.conn != C_DISCONNECTING)) {
/* Do not test for mdev->state.conn == C_STANDALONE, since
someone else might connect us in the mean time! */
retcode = ERR_INTR;
goto fail;
}
done:
retcode = NO_ERROR;
fail:
drbd_md_sync(mdev);
reply->ret_code = retcode;
return 0;
}
void resync_after_online_grow(struct drbd_conf *mdev)
{
int iass; /* I am sync source */
dev_info(DEV, "Resync of new storage after online grow\n");
if (mdev->state.role != mdev->state.peer)
iass = (mdev->state.role == R_PRIMARY);
else
iass = test_bit(DISCARD_CONCURRENT, &mdev->flags);
if (iass)
drbd_start_resync(mdev, C_SYNC_SOURCE);
else
_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
}
static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
struct resize rs;
int retcode = NO_ERROR;
int ldsc = 0; /* local disk size changed */
enum determine_dev_size dd;
memset(&rs, 0, sizeof(struct resize));
if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
retcode = ERR_MANDATORY_TAG;
goto fail;
}
if (mdev->state.conn > C_CONNECTED) {
retcode = ERR_RESIZE_RESYNC;
goto fail;
}
if (mdev->state.role == R_SECONDARY &&
mdev->state.peer == R_SECONDARY) {
retcode = ERR_NO_PRIMARY;
goto fail;
}
if (!get_ldev(mdev)) {
retcode = ERR_NO_DISK;
goto fail;
}
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
ldsc = 1;
}
mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
dd = drbd_determin_dev_size(mdev);
drbd_md_sync(mdev);
put_ldev(mdev);
if (dd == dev_size_error) {
retcode = ERR_NOMEM_BITMAP;
goto fail;
}
if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
if (dd == grew)
set_bit(RESIZE_PENDING, &mdev->flags);
drbd_send_uuids(mdev);
drbd_send_sizes(mdev, 1);
}
fail:
reply->ret_code = retcode;
return 0;
}
static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode = NO_ERROR;
int err;
int ovr; /* online verify running */
int rsr; /* re-sync running */
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
struct syncer_conf sc;
cpumask_var_t new_cpu_mask;
if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
retcode = ERR_NOMEM;
goto fail;
}
if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
memset(&sc, 0, sizeof(struct syncer_conf));
sc.rate = DRBD_RATE_DEF;
sc.after = DRBD_AFTER_DEF;
sc.al_extents = DRBD_AL_EXTENTS_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
retcode = ERR_MANDATORY_TAG;
goto fail;
}
/* re-sync running */
rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
mdev->state.conn == C_SYNC_TARGET ||
mdev->state.conn == C_PAUSED_SYNC_S ||
mdev->state.conn == C_PAUSED_SYNC_T );
if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
retcode = ERR_CSUMS_RESYNC_RUNNING;
goto fail;
}
if (!rsr && sc.csums_alg[0]) {
csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(csums_tfm)) {
csums_tfm = NULL;
retcode = ERR_CSUMS_ALG;
goto fail;
}
if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
retcode = ERR_CSUMS_ALG_ND;
goto fail;
}
}
/* online verify running */
ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
if (ovr) {
if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
retcode = ERR_VERIFY_RUNNING;
goto fail;
}
}
if (!ovr && sc.verify_alg[0]) {
verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(verify_tfm)) {
verify_tfm = NULL;
retcode = ERR_VERIFY_ALG;
goto fail;
}
if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
retcode = ERR_VERIFY_ALG_ND;
goto fail;
}
}
/* silently ignore cpu mask on UP kernel */
if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
err = __bitmap_parse(sc.cpu_mask, 32, 0,
cpumask_bits(new_cpu_mask), nr_cpu_ids);
if (err) {
dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
retcode = ERR_CPU_MASK_PARSE;
goto fail;
}
}
ERR_IF (sc.rate < 1) sc.rate = 1;
ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
if (sc.al_extents > AL_MAX) {
dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
sc.al_extents = AL_MAX;
}
#undef AL_MAX
/* most sanity checks done, try to assign the new sync-after
* dependency. need to hold the global lock in there,
* to avoid a race in the dependency loop check. */
retcode = drbd_alter_sa(mdev, sc.after);
if (retcode != NO_ERROR)
goto fail;
/* ok, assign the rest of it as well.
* lock against receive_SyncParam() */
spin_lock(&mdev->peer_seq_lock);
mdev->sync_conf = sc;
if (!rsr) {
crypto_free_hash(mdev->csums_tfm);
mdev->csums_tfm = csums_tfm;
csums_tfm = NULL;
}
if (!ovr) {
crypto_free_hash(mdev->verify_tfm);
mdev->verify_tfm = verify_tfm;
verify_tfm = NULL;
}
spin_unlock(&mdev->peer_seq_lock);
if (get_ldev(mdev)) {
wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
drbd_al_shrink(mdev);
err = drbd_check_al_size(mdev);
lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
put_ldev(mdev);
drbd_md_sync(mdev);
if (err) {
retcode = ERR_NOMEM;
goto fail;
}
}
if (mdev->state.conn >= C_CONNECTED)
drbd_send_sync_param(mdev, &sc);
if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
cpumask_copy(mdev->cpu_mask, new_cpu_mask);
drbd_calc_cpu_mask(mdev);
mdev->receiver.reset_cpu_mask = 1;
mdev->asender.reset_cpu_mask = 1;
mdev->worker.reset_cpu_mask = 1;
}
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
free_cpumask_var(new_cpu_mask);
crypto_free_hash(csums_tfm);
crypto_free_hash(verify_tfm);
reply->ret_code = retcode;
return 0;
}
static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode;
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION)
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
while (retcode == SS_NEED_CONNECTION) {
spin_lock_irq(&mdev->req_lock);
if (mdev->state.conn < C_CONNECTED)
retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
if (retcode != SS_NEED_CONNECTION)
break;
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
}
reply->ret_code = retcode;
return 0;
}
static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
return 0;
}
static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode = NO_ERROR;
if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
retcode = ERR_PAUSE_IS_SET;
reply->ret_code = retcode;
return 0;
}
static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode = NO_ERROR;
if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO)
retcode = ERR_PAUSE_IS_CLEAR;
reply->ret_code = retcode;
return 0;
}
static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
return 0;
}
static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
return 0;
}
static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
return 0;
}
static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
unsigned short *tl;
tl = reply->tag_list;
if (get_ldev(mdev)) {
tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
put_ldev(mdev);
}
if (get_net_conf(mdev)) {
tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
put_net_conf(mdev);
}
tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
put_unaligned(TT_END, tl++); /* Close the tag list */
return (int)((char *)tl - (char *)reply->tag_list);
}
static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
unsigned short *tl = reply->tag_list;
union drbd_state s = mdev->state;
unsigned long rs_left;
unsigned int res;
tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
/* no local ref, no bitmap, no syncer progress. */
if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
if (get_ldev(mdev)) {
drbd_get_syncer_progress(mdev, &rs_left, &res);
tl = tl_add_int(tl, T_sync_progress, &res);
put_ldev(mdev);
}
}
put_unaligned(TT_END, tl++); /* Close the tag list */
return (int)((char *)tl - (char *)reply->tag_list);
}
static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
unsigned short *tl;
tl = reply->tag_list;
if (get_ldev(mdev)) {
tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
put_ldev(mdev);
}
put_unaligned(TT_END, tl++); /* Close the tag list */
return (int)((char *)tl - (char *)reply->tag_list);
}
/**
* drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
* @mdev: DRBD device.
* @nlp: Netlink/connector packet from drbdsetup
* @reply: Reply packet for drbdsetup
*/
static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
unsigned short *tl;
char rv;
tl = reply->tag_list;
rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT;
tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
put_unaligned(TT_END, tl++); /* Close the tag list */
return (int)((char *)tl - (char *)reply->tag_list);
}
static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
/* default to resume from last known position, if possible */
struct start_ov args =
{ .start_sector = mdev->ov_start_sector };
if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
reply->ret_code = ERR_MANDATORY_TAG;
return 0;
}
/* w_make_ov_request expects position to be aligned */
mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
return 0;
}
static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
int retcode = NO_ERROR;
int skip_initial_sync = 0;
int err;
struct new_c_uuid args;
memset(&args, 0, sizeof(struct new_c_uuid));
if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
reply->ret_code = ERR_MANDATORY_TAG;
return 0;
}
mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
if (!get_ldev(mdev)) {
retcode = ERR_NO_DISK;
goto out;
}
/* this is "skip initial sync", assume to be clean */
if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
dev_info(DEV, "Preparing to skip initial sync\n");
skip_initial_sync = 1;
} else if (mdev->state.conn != C_STANDALONE) {
retcode = ERR_CONNECTED;
goto out_dec;
}
drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */
if (args.clear_bm) {
err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid");
if (err) {
dev_err(DEV, "Writing bitmap failed with %d\n",err);
retcode = ERR_IO_MD_DISK;
}
if (skip_initial_sync) {
drbd_send_uuids_skip_initial_sync(mdev);
_drbd_uuid_set(mdev, UI_BITMAP, 0);
spin_lock_irq(&mdev->req_lock);
_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
}
drbd_md_sync(mdev);
out_dec:
put_ldev(mdev);
out:
mutex_unlock(&mdev->state_mutex);
reply->ret_code = retcode;
return 0;
}
static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
{
struct drbd_conf *mdev;
if (nlp->drbd_minor >= minor_count)
return NULL;
mdev = minor_to_mdev(nlp->drbd_minor);
if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
struct gendisk *disk = NULL;
mdev = drbd_new_device(nlp->drbd_minor);
spin_lock_irq(&drbd_pp_lock);
if (minor_table[nlp->drbd_minor] == NULL) {
minor_table[nlp->drbd_minor] = mdev;
disk = mdev->vdisk;
mdev = NULL;
} /* else: we lost the race */
spin_unlock_irq(&drbd_pp_lock);
if (disk) /* we won the race above */
/* in case we ever add a drbd_delete_device(),
* don't forget the del_gendisk! */
add_disk(disk);
else /* we lost the race above */
drbd_free_mdev(mdev);
mdev = minor_to_mdev(nlp->drbd_minor);
}
return mdev;
}
struct cn_handler_struct {
int (*function)(struct drbd_conf *,
struct drbd_nl_cfg_req *,
struct drbd_nl_cfg_reply *);
int reply_body_size;
};
static struct cn_handler_struct cnd_table[] = {
[ P_primary ] = { &drbd_nl_primary, 0 },
[ P_secondary ] = { &drbd_nl_secondary, 0 },
[ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
[ P_detach ] = { &drbd_nl_detach, 0 },
[ P_net_conf ] = { &drbd_nl_net_conf, 0 },
[ P_disconnect ] = { &drbd_nl_disconnect, 0 },
[ P_resize ] = { &drbd_nl_resize, 0 },
[ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
[ P_invalidate ] = { &drbd_nl_invalidate, 0 },
[ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
[ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
[ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
[ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
[ P_resume_io ] = { &drbd_nl_resume_io, 0 },
[ P_outdate ] = { &drbd_nl_outdate, 0 },
[ P_get_config ] = { &drbd_nl_get_config,
sizeof(struct syncer_conf_tag_len_struct) +
sizeof(struct disk_conf_tag_len_struct) +
sizeof(struct net_conf_tag_len_struct) },
[ P_get_state ] = { &drbd_nl_get_state,
sizeof(struct get_state_tag_len_struct) +
sizeof(struct sync_progress_tag_len_struct) },
[ P_get_uuids ] = { &drbd_nl_get_uuids,
sizeof(struct get_uuids_tag_len_struct) },
[ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
sizeof(struct get_timeout_flag_tag_len_struct)},
[ P_start_ov ] = { &drbd_nl_start_ov, 0 },
[ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
};
static void drbd_connector_callback(struct cn_msg *req)
{
struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
struct cn_handler_struct *cm;
struct cn_msg *cn_reply;
struct drbd_nl_cfg_reply *reply;
struct drbd_conf *mdev;
int retcode, rr;
int reply_size = sizeof(struct cn_msg)
+ sizeof(struct drbd_nl_cfg_reply)
+ sizeof(short int);
if (!try_module_get(THIS_MODULE)) {
printk(KERN_ERR "drbd: try_module_get() failed!\n");
return;
}
mdev = ensure_mdev(nlp);
if (!mdev) {
retcode = ERR_MINOR_INVALID;
goto fail;
}
trace_drbd_netlink(req, 1);
if (nlp->packet_type >= P_nl_after_last_packet) {
retcode = ERR_PACKET_NR;
goto fail;
}
cm = cnd_table + nlp->packet_type;
/* This may happen if packet number is 0: */
if (cm->function == NULL) {
retcode = ERR_PACKET_NR;
goto fail;
}
reply_size += cm->reply_body_size;
/* allocation not in the IO path, cqueue thread context */
cn_reply = kmalloc(reply_size, GFP_KERNEL);
if (!cn_reply) {
retcode = ERR_NOMEM;
goto fail;
}
reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
reply->packet_type =
cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet;
reply->minor = nlp->drbd_minor;
reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
/* reply->tag_list; might be modified by cm->function. */
rr = cm->function(mdev, nlp, reply);
cn_reply->id = req->id;
cn_reply->seq = req->seq;
cn_reply->ack = req->ack + 1;
cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
cn_reply->flags = 0;
trace_drbd_netlink(cn_reply, 0);
rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
if (rr && rr != -ESRCH)
printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
kfree(cn_reply);
module_put(THIS_MODULE);
return;
fail:
drbd_nl_send_reply(req, retcode);
module_put(THIS_MODULE);
}
static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
static unsigned short *
__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
unsigned short len, int nul_terminated)
{
unsigned short l = tag_descriptions[tag_number(tag)].max_len;
len = (len < l) ? len : l;
put_unaligned(tag, tl++);
put_unaligned(len, tl++);
memcpy(tl, data, len);
tl = (unsigned short*)((char*)tl + len);
if (nul_terminated)
*((char*)tl - 1) = 0;
return tl;
}
static unsigned short *
tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
{
return __tl_add_blob(tl, tag, data, len, 0);
}
static unsigned short *
tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
{
return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
}
static unsigned short *
tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
{
put_unaligned(tag, tl++);
switch(tag_type(tag)) {
case TT_INTEGER:
put_unaligned(sizeof(int), tl++);
put_unaligned(*(int *)val, (int *)tl);
tl = (unsigned short*)((char*)tl+sizeof(int));
break;
case TT_INT64:
put_unaligned(sizeof(u64), tl++);
put_unaligned(*(u64 *)val, (u64 *)tl);
tl = (unsigned short*)((char*)tl+sizeof(u64));
break;
default:
/* someone did something stupid. */
;
}
return tl;
}
void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
{
char buffer[sizeof(struct cn_msg)+
sizeof(struct drbd_nl_cfg_reply)+
sizeof(struct get_state_tag_len_struct)+
sizeof(short int)];
struct cn_msg *cn_reply = (struct cn_msg *) buffer;
struct drbd_nl_cfg_reply *reply =
(struct drbd_nl_cfg_reply *)cn_reply->data;
unsigned short *tl = reply->tag_list;
/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
put_unaligned(TT_END, tl++); /* Close the tag list */
cn_reply->id.idx = CN_IDX_DRBD;
cn_reply->id.val = CN_VAL_DRBD;
cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
cn_reply->ack = 0; /* not used here. */
cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
(int)((char *)tl - (char *)reply->tag_list);
cn_reply->flags = 0;
reply->packet_type = P_get_state;
reply->minor = mdev_to_minor(mdev);
reply->ret_code = NO_ERROR;
trace_drbd_netlink(cn_reply, 0);
cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
}
void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
{
char buffer[sizeof(struct cn_msg)+
sizeof(struct drbd_nl_cfg_reply)+
sizeof(struct call_helper_tag_len_struct)+
sizeof(short int)];
struct cn_msg *cn_reply = (struct cn_msg *) buffer;
struct drbd_nl_cfg_reply *reply =
(struct drbd_nl_cfg_reply *)cn_reply->data;
unsigned short *tl = reply->tag_list;
/* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
tl = tl_add_str(tl, T_helper, helper_name);
put_unaligned(TT_END, tl++); /* Close the tag list */
cn_reply->id.idx = CN_IDX_DRBD;
cn_reply->id.val = CN_VAL_DRBD;
cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
cn_reply->ack = 0; /* not used here. */
cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
(int)((char *)tl - (char *)reply->tag_list);
cn_reply->flags = 0;
reply->packet_type = P_call_helper;
reply->minor = mdev_to_minor(mdev);
reply->ret_code = NO_ERROR;
trace_drbd_netlink(cn_reply, 0);
cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
}
void drbd_bcast_ee(struct drbd_conf *mdev,
const char *reason, const int dgs,
const char* seen_hash, const char* calc_hash,
const struct drbd_epoch_entry* e)
{
struct cn_msg *cn_reply;
struct drbd_nl_cfg_reply *reply;
struct bio_vec *bvec;
unsigned short *tl;
int i;
if (!e)
return;
if (!reason || !reason[0])
return;
/* apparently we have to memcpy twice, first to prepare the data for the
* struct cn_msg, then within cn_netlink_send from the cn_msg to the
* netlink skb. */
/* receiver thread context, which is not in the writeout path (of this node),
* but may be in the writeout path of the _other_ node.
* GFP_NOIO to avoid potential "distributed deadlock". */
cn_reply = kmalloc(
sizeof(struct cn_msg)+
sizeof(struct drbd_nl_cfg_reply)+
sizeof(struct dump_ee_tag_len_struct)+
sizeof(short int),
GFP_NOIO);
if (!cn_reply) {
dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
(unsigned long long)e->sector, e->size);
return;
}
reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
tl = reply->tag_list;
tl = tl_add_str(tl, T_dump_ee_reason, reason);
tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
tl = tl_add_int(tl, T_ee_sector, &e->sector);
tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
put_unaligned(T_ee_data, tl++);
put_unaligned(e->size, tl++);
__bio_for_each_segment(bvec, e->private_bio, i, 0) {
void *d = kmap(bvec->bv_page);
memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
kunmap(bvec->bv_page);
tl=(unsigned short*)((char*)tl + bvec->bv_len);
}
put_unaligned(TT_END, tl++); /* Close the tag list */
cn_reply->id.idx = CN_IDX_DRBD;
cn_reply->id.val = CN_VAL_DRBD;
cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
cn_reply->ack = 0; // not used here.
cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
(int)((char*)tl - (char*)reply->tag_list);
cn_reply->flags = 0;
reply->packet_type = P_dump_ee;
reply->minor = mdev_to_minor(mdev);
reply->ret_code = NO_ERROR;
trace_drbd_netlink(cn_reply, 0);
cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
kfree(cn_reply);
}
void drbd_bcast_sync_progress(struct drbd_conf *mdev)
{
char buffer[sizeof(struct cn_msg)+
sizeof(struct drbd_nl_cfg_reply)+
sizeof(struct sync_progress_tag_len_struct)+
sizeof(short int)];
struct cn_msg *cn_reply = (struct cn_msg *) buffer;
struct drbd_nl_cfg_reply *reply =
(struct drbd_nl_cfg_reply *)cn_reply->data;
unsigned short *tl = reply->tag_list;
unsigned long rs_left;
unsigned int res;
/* no local ref, no bitmap, no syncer progress, no broadcast. */
if (!get_ldev(mdev))
return;
drbd_get_syncer_progress(mdev, &rs_left, &res);
put_ldev(mdev);
tl = tl_add_int(tl, T_sync_progress, &res);
put_unaligned(TT_END, tl++); /* Close the tag list */
cn_reply->id.idx = CN_IDX_DRBD;
cn_reply->id.val = CN_VAL_DRBD;
cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
cn_reply->ack = 0; /* not used here. */
cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
(int)((char *)tl - (char *)reply->tag_list);
cn_reply->flags = 0;
reply->packet_type = P_sync_progress;
reply->minor = mdev_to_minor(mdev);
reply->ret_code = NO_ERROR;
trace_drbd_netlink(cn_reply, 0);
cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
}
int __init drbd_nl_init(void)
{
static struct cb_id cn_id_drbd;
int err, try=10;
cn_id_drbd.val = CN_VAL_DRBD;
do {
cn_id_drbd.idx = cn_idx;
err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
if (!err)
break;
cn_idx = (cn_idx + CN_IDX_STEP);
} while (try--);
if (err) {
printk(KERN_ERR "drbd: cn_drbd failed to register\n");
return err;
}
return 0;
}
void drbd_nl_cleanup(void)
{
static struct cb_id cn_id_drbd;
cn_id_drbd.idx = cn_idx;
cn_id_drbd.val = CN_VAL_DRBD;
cn_del_callback(&cn_id_drbd);
}
void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
{
char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
struct cn_msg *cn_reply = (struct cn_msg *) buffer;
struct drbd_nl_cfg_reply *reply =
(struct drbd_nl_cfg_reply *)cn_reply->data;
int rr;
cn_reply->id = req->id;
cn_reply->seq = req->seq;
cn_reply->ack = req->ack + 1;
cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
cn_reply->flags = 0;
reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
reply->ret_code = ret_code;
trace_drbd_netlink(cn_reply, 0);
rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
if (rr && rr != -ESRCH)
printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
}
/*
drbd_proc.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/autoconf.h>
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/drbd.h>
#include "drbd_int.h"
static int drbd_proc_open(struct inode *inode, struct file *file);
struct proc_dir_entry *drbd_proc;
struct file_operations drbd_proc_fops = {
.owner = THIS_MODULE,
.open = drbd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*lge
* progress bars shamelessly adapted from driver/md/md.c
* output looks like
* [=====>..............] 33.5% (23456/123456)
* finish: 2:20:20 speed: 6,345 (6,456) K/sec
*/
static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
{
unsigned long db, dt, dbdt, rt, rs_left;
unsigned int res;
int i, x, y;
drbd_get_syncer_progress(mdev, &rs_left, &res);
x = res/50;
y = 20-x;
seq_printf(seq, "\t[");
for (i = 1; i < x; i++)
seq_printf(seq, "=");
seq_printf(seq, ">");
for (i = 0; i < y; i++)
seq_printf(seq, ".");
seq_printf(seq, "] ");
seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
/* if more than 1 GB display in MB */
if (mdev->rs_total > 0x100000L)
seq_printf(seq, "(%lu/%lu)M\n\t",
(unsigned long) Bit2KB(rs_left >> 10),
(unsigned long) Bit2KB(mdev->rs_total >> 10));
else
seq_printf(seq, "(%lu/%lu)K\n\t",
(unsigned long) Bit2KB(rs_left),
(unsigned long) Bit2KB(mdev->rs_total));
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
* the * 100 / 100 trick are important. We do a +1 to be
* safe against division by zero. We only estimate anyway.
*
* dt: time from mark until now
* db: blocks written from mark until now
* rt: remaining time
*/
dt = (jiffies - mdev->rs_mark_time) / HZ;
if (dt > 20) {
/* if we made no update to rs_mark_time for too long,
* we are stalled. show that. */
seq_printf(seq, "stalled\n");
return;
}
if (!dt)
dt++;
db = mdev->rs_mark_left - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
rt / 3600, (rt % 3600) / 60, rt % 60);
/* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */
dbdt = Bit2KB(db/dt);
if (dbdt > 1000)
seq_printf(seq, " speed: %ld,%03ld",
dbdt/1000, dbdt % 1000);
else
seq_printf(seq, " speed: %ld", dbdt);
/* mean speed since syncer started
* we do account for PausedSync periods */
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
if (dt <= 0)
dt = 1;
db = mdev->rs_total - rs_left;
dbdt = Bit2KB(db/dt);
if (dbdt > 1000)
seq_printf(seq, " (%ld,%03ld)",
dbdt/1000, dbdt % 1000);
else
seq_printf(seq, " (%ld)", dbdt);
seq_printf(seq, " K/sec\n");
}
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
{
struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
seq_printf(seq, "%5d %s %s\n", bme->rs_left,
bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------",
bme->flags & BME_LOCKED ? "LOCKED" : "------"
);
}
static int drbd_seq_show(struct seq_file *seq, void *v)
{
int i, hole = 0;
const char *sn;
struct drbd_conf *mdev;
static char write_ordering_chars[] = {
[WO_none] = 'n',
[WO_drain_io] = 'd',
[WO_bdev_flush] = 'f',
[WO_bio_barrier] = 'b',
};
seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
/*
cs .. connection state
ro .. node role (local/remote)
ds .. disk state (local/remote)
protocol
various flags
ns .. network send
nr .. network receive
dw .. disk write
dr .. disk read
al .. activity log write count
bm .. bitmap update write count
pe .. pending (waiting for ack or data reply)
ua .. unack'd (still need to send ack or data reply)
ap .. application requests accepted, but not yet completed
ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
wo .. write ordering mode currently in use
oos .. known out-of-sync kB
*/
for (i = 0; i < minor_count; i++) {
mdev = minor_to_mdev(i);
if (!mdev) {
hole = 1;
continue;
}
if (hole) {
hole = 0;
seq_printf(seq, "\n");
}
sn = drbd_conn_str(mdev->state.conn);
if (mdev->state.conn == C_STANDALONE &&
mdev->state.disk == D_DISKLESS &&
mdev->state.role == R_SECONDARY) {
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
drbd_role_str(mdev->state.role),
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
mdev->state.susp ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
mdev->read_cnt/2,
mdev->al_writ_cnt,
mdev->bm_writ_cnt,
atomic_read(&mdev->local_cnt),
atomic_read(&mdev->ap_pending_cnt) +
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
mdev->epochs,
write_ordering_chars[mdev->write_ordering]
);
seq_printf(seq, " oos:%lu\n",
Bit2KB(drbd_bm_total_weight(mdev)));
}
if (mdev->state.conn == C_SYNC_SOURCE ||
mdev->state.conn == C_SYNC_TARGET)
drbd_syncer_progress(mdev, seq);
if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T)
seq_printf(seq, "\t%3d%% %lu/%lu\n",
(int)((mdev->rs_total-mdev->ov_left) /
(mdev->rs_total/100+1)),
mdev->rs_total - mdev->ov_left,
mdev->rs_total);
if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) {
lc_seq_printf_stats(seq, mdev->resync);
lc_seq_printf_stats(seq, mdev->act_log);
put_ldev(mdev);
}
if (proc_details >= 2) {
if (mdev->resync) {
lc_seq_dump_details(seq, mdev->resync, "rs_left",
resync_dump_detail);
}
}
}
return 0;
}
static int drbd_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, drbd_seq_show, PDE(inode)->data);
}
/* PROC FS stuff end */
This source diff could not be displayed because it is too large. You can view the blob instead.
/*
drbd_req.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/autoconf.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_tracing.h"
#include "drbd_req.h"
/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
{
const int rw = bio_data_dir(bio);
int cpu;
cpu = part_stat_lock();
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
part_stat_unlock();
mdev->vdisk->part0.in_flight[rw]++;
}
/* Update disk stats when completing request upwards */
static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
{
int rw = bio_data_dir(req->master_bio);
unsigned long duration = jiffies - req->start_time;
int cpu;
cpu = part_stat_lock();
part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration);
part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_unlock();
mdev->vdisk->part0.in_flight[rw]--;
}
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
const unsigned long s = req->rq_state;
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
/* remove it from the transfer log.
* well, only if it had been there in the first
* place... if it had not (local only or conflicting
* and never sent), it should still be "empty" as
* initialized in drbd_req_new(), so we can list_del() it
* here unconditionally */
list_del(&req->tl_requests);
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
* READ with local io-error */
if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
drbd_set_out_of_sync(mdev, req->sector, req->size);
if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
drbd_set_in_sync(mdev, req->sector, req->size);
/* one might be tempted to move the drbd_al_complete_io
* to the local io completion callback drbd_endio_pri.
* but, if this was a mirror write, we may only
* drbd_al_complete_io after this is RQ_NET_DONE,
* otherwise the extent could be dropped from the al
* before it has actually been written on the peer.
* if we crash before our peer knows about the request,
* but after the extent has been dropped from the al,
* we would forget to resync the corresponding extent.
*/
if (s & RQ_LOCAL_MASK) {
if (get_ldev_if_state(mdev, D_FAILED)) {
drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
"but my Disk seems to have failed :(\n",
(unsigned long long) req->sector);
}
}
}
/* if it was a local io error, we want to notify our
* peer about that, and see if we need to
* detach the disk and stuff.
* to avoid allocating some special work
* struct, reuse the request. */
/* THINK
* why do we do this not when we detect the error,
* but delay it until it is "done", i.e. possibly
* until the next barrier ack? */
if (rw == WRITE &&
((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) {
if (!(req->w.list.next == LIST_POISON1 ||
list_empty(&req->w.list))) {
/* DEBUG ASSERT only; if this triggers, we
* probably corrupt the worker list here */
dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next);
dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev);
}
req->w.cb = w_io_error;
drbd_queue_work(&mdev->data.work, &req->w);
/* drbd_req_free() is done in w_io_error */
} else {
drbd_req_free(req);
}
}
static void queue_barrier(struct drbd_conf *mdev)
{
struct drbd_tl_epoch *b;
/* We are within the req_lock. Once we queued the barrier for sending,
* we set the CREATE_BARRIER bit. It is cleared as soon as a new
* barrier/epoch object is added. This is the only place this bit is
* set. It indicates that the barrier for this epoch is already queued,
* and no new epoch has been created yet. */
if (test_bit(CREATE_BARRIER, &mdev->flags))
return;
b = mdev->newest_tle;
b->w.cb = w_send_barrier;
/* inc_ap_pending done here, so we won't
* get imbalanced on connection loss.
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in tl_clear. */
inc_ap_pending(mdev);
drbd_queue_work(&mdev->data.work, &b->w);
set_bit(CREATE_BARRIER, &mdev->flags);
}
static void _about_to_complete_local_write(struct drbd_conf *mdev,
struct drbd_request *req)
{
const unsigned long s = req->rq_state;
struct drbd_request *i;
struct drbd_epoch_entry *e;
struct hlist_node *n;
struct hlist_head *slot;
/* before we can signal completion to the upper layers,
* we may need to close the current epoch */
if (mdev->state.conn >= C_CONNECTED &&
req->epoch == mdev->newest_tle->br_number)
queue_barrier(mdev);
/* we need to do the conflict detection stuff,
* if we have the ee_hash (two_primaries) and
* this has been on the network */
if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
const sector_t sector = req->sector;
const int size = req->size;
/* ASSERT:
* there must be no conflicting requests, since
* they must have been failed on the spot */
#define OVERLAPS overlaps(sector, size, i->sector, i->size)
slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, colision) {
if (OVERLAPS) {
dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
"other: %p %llus +%u\n",
req, (unsigned long long)sector, size,
i, (unsigned long long)i->sector, i->size);
}
}
/* maybe "wake" those conflicting epoch entries
* that wait for this request to finish.
*
* currently, there can be only _one_ such ee
* (well, or some more, which would be pending
* P_DISCARD_ACK not yet sent by the asender...),
* since we block the receiver thread upon the
* first conflict detection, which will wait on
* misc_wait. maybe we want to assert that?
*
* anyways, if we found one,
* we just have to do a wake_up. */
#undef OVERLAPS
#define OVERLAPS overlaps(sector, size, e->sector, e->size)
slot = ee_hash_slot(mdev, req->sector);
hlist_for_each_entry(e, n, slot, colision) {
if (OVERLAPS) {
wake_up(&mdev->misc_wait);
break;
}
}
}
#undef OVERLAPS
}
void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m)
{
trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL);
bio_endio(m->bio, m->error);
dec_ap_bio(mdev);
}
/* Helper for __req_mod().
* Set m->bio to the master bio, if it is fit to be completed,
* or leave it alone (it is initialized to NULL in __req_mod),
* if it has already been completed, or cannot be completed yet.
* If m->bio is set, the error status to be returned is placed in m->error.
*/
void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
{
const unsigned long s = req->rq_state;
struct drbd_conf *mdev = req->mdev;
/* only WRITES may end up here without a master bio (on barrier ack) */
int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
trace_drbd_req(req, nothing, "_req_may_be_done");
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
* not yet acknowledged by the peer
* not yet completed by the local io subsystem
* these flags may get cleared in any order by
* the worker,
* the receiver,
* the bio_endio completion callbacks.
*/
if (s & RQ_NET_QUEUED)
return;
if (s & RQ_NET_PENDING)
return;
if (s & RQ_LOCAL_PENDING)
return;
if (req->master_bio) {
/* this is data_received (remote read)
* or protocol C P_WRITE_ACK
* or protocol B P_RECV_ACK
* or protocol A "handed_over_to_network" (SendAck)
* or canceled or failed,
* or killed from the transfer log due to connection loss.
*/
/*
* figure out whether to report success or failure.
*
* report success when at least one of the operations succeeded.
* or, to put the other way,
* only report failure, when both operations failed.
*
* what to do about the failures is handled elsewhere.
* what we need to do here is just: complete the master_bio.
*
* local completion error, if any, has been stored as ERR_PTR
* in private_bio within drbd_endio_pri.
*/
int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
int error = PTR_ERR(req->private_bio);
/* remove the request from the conflict detection
* respective block_id verification hash */
if (!hlist_unhashed(&req->colision))
hlist_del(&req->colision);
else
D_ASSERT((s & RQ_NET_MASK) == 0);
/* for writes we need to do some extra housekeeping */
if (rw == WRITE)
_about_to_complete_local_write(mdev, req);
/* Update disk stats */
_drbd_end_io_acct(mdev, req);
m->error = ok ? 0 : (error ?: -EIO);
m->bio = req->master_bio;
req->master_bio = NULL;
}
if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
/* this is disconnected (local only) operation,
* or protocol C P_WRITE_ACK,
* or protocol A or B P_BARRIER_ACK,
* or killed from the transfer log due to connection loss. */
_req_is_done(mdev, req, rw);
}
/* else: network part and not DONE yet. that is
* protocol A or B, barrier ack still pending... */
}
/*
* checks whether there was an overlapping request
* or ee already registered.
*
* if so, return 1, in which case this request is completed on the spot,
* without ever being submitted or send.
*
* return 0 if it is ok to submit this request.
*
* NOTE:
* paranoia: assume something above us is broken, and issues different write
* requests for the same block simultaneously...
*
* To ensure these won't be reordered differently on both nodes, resulting in
* diverging data sets, we discard the later one(s). Not that this is supposed
* to happen, but this is the rationale why we also have to check for
* conflicting requests with local origin, and why we have to do so regardless
* of whether we allowed multiple primaries.
*
* BTW, in case we only have one primary, the ee_hash is empty anyways, and the
* second hlist_for_each_entry becomes a noop. This is even simpler than to
* grab a reference on the net_conf, and check for the two_primaries flag...
*/
static int _req_conflicts(struct drbd_request *req)
{
struct drbd_conf *mdev = req->mdev;
const sector_t sector = req->sector;
const int size = req->size;
struct drbd_request *i;
struct drbd_epoch_entry *e;
struct hlist_node *n;
struct hlist_head *slot;
D_ASSERT(hlist_unhashed(&req->colision));
if (!get_net_conf(mdev))
return 0;
/* BUG_ON */
ERR_IF (mdev->tl_hash_s == 0)
goto out_no_conflict;
BUG_ON(mdev->tl_hash == NULL);
#define OVERLAPS overlaps(i->sector, i->size, sector, size)
slot = tl_hash_slot(mdev, sector);
hlist_for_each_entry(i, n, slot, colision) {
if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent local write detected! "
"[DISCARD L] new: %llus +%u; "
"pending: %llus +%u\n",
current->comm, current->pid,
(unsigned long long)sector, size,
(unsigned long long)i->sector, i->size);
goto out_conflict;
}
}
if (mdev->ee_hash_s) {
/* now, check for overlapping requests with remote origin */
BUG_ON(mdev->ee_hash == NULL);
#undef OVERLAPS
#define OVERLAPS overlaps(e->sector, e->size, sector, size)
slot = ee_hash_slot(mdev, sector);
hlist_for_each_entry(e, n, slot, colision) {
if (OVERLAPS) {
dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
" [DISCARD L] new: %llus +%u; "
"pending: %llus +%u\n",
current->comm, current->pid,
(unsigned long long)sector, size,
(unsigned long long)e->sector, e->size);
goto out_conflict;
}
}
}
#undef OVERLAPS
out_no_conflict:
/* this is like it should be, and what we expected.
* our users do behave after all... */
put_net_conf(mdev);
return 0;
out_conflict:
put_net_conf(mdev);
return 1;
}
/* obviously this could be coded as many single functions
* instead of one huge switch,
* or by putting the code directly in the respective locations
* (as it has been before).
*
* but having it this way
* enforces that it is all in this one place, where it is easier to audit,
* it makes it obvious that whatever "event" "happens" to a request should
* happen "atomically" within the req_lock,
* and it enforces that we have to think in a very structured manner
* about the "events" that may happen to a request during its life time ...
*/
void __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
struct drbd_conf *mdev = req->mdev;
m->bio = NULL;
trace_drbd_req(req, what, NULL);
switch (what) {
default:
dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
break;
/* does not happen...
* initialization done in drbd_req_new
case created:
break;
*/
case to_be_send: /* via network */
/* reached via drbd_make_request_common
* and from w_read_retry_remote */
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
req->rq_state |= RQ_NET_PENDING;
inc_ap_pending(mdev);
break;
case to_be_submitted: /* locally */
/* reached via drbd_make_request_common */
D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
req->rq_state |= RQ_LOCAL_PENDING;
break;
case completed_ok:
if (bio_data_dir(req->master_bio) == WRITE)
mdev->writ_cnt += req->size>>9;
else
mdev->read_cnt += req->size>>9;
req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done(req, m);
put_ldev(mdev);
break;
case write_completed_with_error:
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n",
(unsigned long long)req->sector, req->size);
/* and now: check how to handle local io error. */
__drbd_chk_io_error(mdev, FALSE);
_req_may_be_done(req, m);
put_ldev(mdev);
break;
case read_ahead_completed_with_error:
/* it is legal to fail READA */
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
_req_may_be_done(req, m);
put_ldev(mdev);
break;
case read_completed_with_error:
drbd_set_out_of_sync(mdev, req->sector, req->size);
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
dev_alert(DEV, "Local READ failed sec=%llus size=%u\n",
(unsigned long long)req->sector, req->size);
/* _req_mod(req,to_be_send); oops, recursion... */
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
req->rq_state |= RQ_NET_PENDING;
inc_ap_pending(mdev);
__drbd_chk_io_error(mdev, FALSE);
put_ldev(mdev);
/* NOTE: if we have no connection,
* or know the peer has no good data either,
* then we don't actually need to "queue_for_net_read",
* but we do so anyways, since the drbd_io_error()
* and the potential state change to "Diskless"
* needs to be done from process context */
/* fall through: _req_mod(req,queue_for_net_read); */
case queue_for_net_read:
/* READ or READA, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
/* from drbd_make_request_common
* or from bio_endio during read io-error recovery */
/* so we can verify the handle in the answer packet
* corresponding hlist_del is in _req_may_be_done() */
hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector));
set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_NET_QUEUED;
req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
? w_read_retry_remote
: w_send_read_req;
drbd_queue_work(&mdev->data.work, &req->w);
break;
case queue_for_net_write:
/* assert something? */
/* from drbd_make_request_common only */
hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector));
/* corresponding hlist_del is in _req_may_be_done() */
/* NOTE
* In case the req ended up on the transfer log before being
* queued on the worker, it could lead to this request being
* missed during cleanup after connection loss.
* So we have to do both operations here,
* within the same lock that protects the transfer log.
*
* _req_add_to_epoch(req); this has to be after the
* _maybe_start_new_epoch(req); which happened in
* drbd_make_request_common, because we now may set the bit
* again ourselves to close the current epoch.
*
* Add req to the (now) current epoch (barrier). */
/* see drbd_make_request_common,
* just after it grabs the req_lock */
D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
req->epoch = mdev->newest_tle->br_number;
list_add_tail(&req->tl_requests,
&mdev->newest_tle->requests);
/* increment size of current epoch */
mdev->newest_tle->n_req++;
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_NET_QUEUED;
req->w.cb = w_send_dblock;
drbd_queue_work(&mdev->data.work, &req->w);
/* close the epoch, in case it outgrew the limit */
if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
queue_barrier(mdev);
break;
case send_canceled:
/* treat it the same */
case send_failed:
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
req->rq_state &= ~RQ_NET_QUEUED;
/* if we did it right, tl_clear should be scheduled only after
* this, so this should not be necessary! */
_req_may_be_done(req, m);
break;
case handed_over_to_network:
/* assert something? */
if (bio_data_dir(req->master_bio) == WRITE &&
mdev->net_conf->wire_protocol == DRBD_PROT_A) {
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer. */
if (req->rq_state & RQ_NET_PENDING) {
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= RQ_NET_OK;
} /* else: neg-ack was faster... */
/* it is still not yet RQ_NET_DONE until the
* corresponding epoch barrier got acked as well,
* so we know what to dirty on connection loss */
}
req->rq_state &= ~RQ_NET_QUEUED;
req->rq_state |= RQ_NET_SENT;
/* because _drbd_send_zc_bio could sleep, and may want to
* dereference the bio even after the "write_acked_by_peer" and
* "completed_ok" events came in, once we return from
* _drbd_send_zc_bio (drbd_send_dblock), we have to check
* whether it is done already, and end it. */
_req_may_be_done(req, m);
break;
case connection_lost_while_pending:
/* transfer log cleanup after connection loss */
/* assert something? */
if (req->rq_state & RQ_NET_PENDING)
dec_ap_pending(mdev);
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
/* if it is still queued, we may not complete it here.
* it will be canceled soon. */
if (!(req->rq_state & RQ_NET_QUEUED))
_req_may_be_done(req, m);
break;
case write_acked_by_peer_and_sis:
req->rq_state |= RQ_NET_SIS;
case conflict_discarded_by_peer:
/* for discarded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
* node crashes are covered by the activity log. */
if (what == conflict_discarded_by_peer)
dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
" DRBD is not a random data generator!\n",
(unsigned long long)req->sector, req->size);
req->rq_state |= RQ_NET_DONE;
/* fall through */
case write_acked_by_peer:
/* protocol C; successfully written on peer.
* Nothing to do here.
* We want to keep the tl in place for all protocols, to cater
* for volatile write-back caches on lower level devices.
*
* A barrier request is expected to have forced all prior
* requests onto stable storage, so completion of a barrier
* request could set NET_DONE right here, and not wait for the
* P_BARRIER_ACK, but that is an unnecessary optimization. */
/* this makes it effectively the same as for: */
case recv_acked_by_peer:
/* protocol B; pretends to be successfully written on peer.
* see also notes above in handed_over_to_network about
* protocol != C */
req->rq_state |= RQ_NET_OK;
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
_req_may_be_done(req, m);
break;
case neg_acked:
/* assert something? */
if (req->rq_state & RQ_NET_PENDING)
dec_ap_pending(mdev);
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
_req_may_be_done(req, m);
/* else: done by handed_over_to_network */
break;
case barrier_acked:
if (req->rq_state & RQ_NET_PENDING) {
/* barrier came in before all requests have been acked.
* this is bad, because if the connection is lost now,
* we won't be able to clean them up... */
dev_err(DEV, "FIXME (barrier_acked but pending)\n");
trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)");
list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
}
D_ASSERT(req->rq_state & RQ_NET_SENT);
req->rq_state |= RQ_NET_DONE;
_req_may_be_done(req, m);
break;
case data_received:
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
_req_may_be_done(req, m);
break;
};
}
/* we may do a local read if:
* - we are consistent (of course),
* - or we are generally inconsistent,
* BUT we are still/already IN SYNC for this area.
* since size may be bigger than BM_BLOCK_SIZE,
* we may need to check several bits.
*/
static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
{
unsigned long sbnr, ebnr;
sector_t esector, nr_sectors;
if (mdev->state.disk == D_UP_TO_DATE)
return 1;
if (mdev->state.disk >= D_OUTDATED)
return 0;
if (mdev->state.disk < D_INCONSISTENT)
return 0;
/* state.disk == D_INCONSISTENT We will have a look at the BitMap */
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
D_ASSERT(sector < nr_sectors);
D_ASSERT(esector < nr_sectors);
sbnr = BM_SECT_TO_BIT(sector);
ebnr = BM_SECT_TO_BIT(esector);
return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
}
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
{
const int rw = bio_rw(bio);
const int size = bio->bi_size;
const sector_t sector = bio->bi_sector;
struct drbd_tl_epoch *b = NULL;
struct drbd_request *req;
int local, remote;
int err = -EIO;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
if (!req) {
dec_ap_bio(mdev);
/* only pass the error to the upper layers.
* if user cannot handle io errors, that's not our business. */
dev_err(DEV, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
return 0;
}
trace_drbd_bio(mdev, "Rq", bio, 0, req);
local = get_ldev(mdev);
if (!local) {
bio_put(req->private_bio); /* or we get a bio leak */
req->private_bio = NULL;
}
if (rw == WRITE) {
remote = 1;
} else {
/* READ || READA */
if (local) {
if (!drbd_may_do_local_read(mdev, sector, size)) {
/* we could kick the syncer to
* sync this extent asap, wait for
* it, then continue locally.
* Or just issue the request remotely.
*/
local = 0;
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
}
remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
}
/* If we have a disk, but a READA request is mapped to remote,
* we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
* Just fail that READA request right here.
*
* THINK: maybe fail all READA when not local?
* or make this configurable...
* if network is slow, READA won't do any good.
*/
if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
err = -EWOULDBLOCK;
goto fail_and_free_req;
}
/* For WRITES going to the local disk, grab a reference on the target
* extent. This waits for any resync activity in the corresponding
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
* of transactional on-disk meta data updates. */
if (rw == WRITE && local)
drbd_al_begin_io(mdev, sector);
remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
if (!(local || remote)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
goto fail_free_complete;
}
/* For WRITE request, we have to make sure that we have an
* unused_spare_tle, in case we need to start a new epoch.
* I try to be smart and avoid to pre-allocate always "just in case",
* but there is a race between testing the bit and pointer outside the
* spinlock, and grabbing the spinlock.
* if we lost that race, we retry. */
if (rw == WRITE && remote &&
mdev->unused_spare_tle == NULL &&
test_bit(CREATE_BARRIER, &mdev->flags)) {
allocate_barrier:
b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
if (!b) {
dev_err(DEV, "Failed to alloc barrier.\n");
err = -ENOMEM;
goto fail_free_complete;
}
}
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
if (remote) {
remote = (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
if (!remote)
dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
if (!(local || remote)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
spin_unlock_irq(&mdev->req_lock);
goto fail_free_complete;
}
}
if (b && mdev->unused_spare_tle == NULL) {
mdev->unused_spare_tle = b;
b = NULL;
}
if (rw == WRITE && remote &&
mdev->unused_spare_tle == NULL &&
test_bit(CREATE_BARRIER, &mdev->flags)) {
/* someone closed the current epoch
* while we were grabbing the spinlock */
spin_unlock_irq(&mdev->req_lock);
goto allocate_barrier;
}
/* Update disk stats */
_drbd_start_io_acct(mdev, req, bio);
/* _maybe_start_new_epoch(mdev);
* If we need to generate a write barrier packet, we have to add the
* new epoch (barrier) object, and queue the barrier packet for sending,
* and queue the req's data after it _within the same lock_, otherwise
* we have race conditions were the reorder domains could be mixed up.
*
* Even read requests may start a new epoch and queue the corresponding
* barrier packet. To get the write ordering right, we only have to
* make sure that, if this is a write request and it triggered a
* barrier packet, this request is queued within the same spinlock. */
if (remote && mdev->unused_spare_tle &&
test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
_tl_add_barrier(mdev, mdev->unused_spare_tle);
mdev->unused_spare_tle = NULL;
} else {
D_ASSERT(!(remote && rw == WRITE &&
test_bit(CREATE_BARRIER, &mdev->flags)));
}
/* NOTE
* Actually, 'local' may be wrong here already, since we may have failed
* to write to the meta data, and may become wrong anytime because of
* local io-error for some other request, which would lead to us
* "detaching" the local disk.
*
* 'remote' may become wrong any time because the network could fail.
*
* This is a harmless race condition, though, since it is handled
* correctly at the appropriate places; so it just defers the failure
* of the respective operation.
*/
/* mark them early for readability.
* this just sets some state flags. */
if (remote)
_req_mod(req, to_be_send);
if (local)
_req_mod(req, to_be_submitted);
/* check this request on the collision detection hash tables.
* if we have a conflict, just complete it here.
* THINK do we want to check reads, too? (I don't think so...) */
if (rw == WRITE && _req_conflicts(req)) {
/* this is a conflicting request.
* even though it may have been only _partially_
* overlapping with one of the currently pending requests,
* without even submitting or sending it, we will
* pretend that it was successfully served right now.
*/
if (local) {
bio_put(req->private_bio);
req->private_bio = NULL;
drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
local = 0;
}
if (remote)
dec_ap_pending(mdev);
_drbd_end_io_acct(mdev, req);
/* THINK: do we want to fail it (-EIO), or pretend success? */
bio_endio(req->master_bio, 0);
req->master_bio = NULL;
dec_ap_bio(mdev);
drbd_req_free(req);
remote = 0;
}
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
if (remote) {
/* either WRITE and C_CONNECTED,
* or READ, and no local disk,
* or READ, but not in sync.
*/
_req_mod(req, (rw == WRITE)
? queue_for_net_write
: queue_for_net_read);
}
spin_unlock_irq(&mdev->req_lock);
kfree(b); /* if someone else has beaten us to it... */
if (local) {
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL);
if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
: rw == READ ? DRBD_FAULT_DT_RD
: DRBD_FAULT_DT_RA))
bio_endio(req->private_bio, -EIO);
else
generic_make_request(req->private_bio);
}
/* we need to plug ALWAYS since we possibly need to kick lo_dev.
* we plug after submit, so we won't miss an unplug event */
drbd_plug_device(mdev);
return 0;
fail_free_complete:
if (rw == WRITE && local)
drbd_al_complete_io(mdev, sector);
fail_and_free_req:
if (local) {
bio_put(req->private_bio);
req->private_bio = NULL;
put_ldev(mdev);
}
bio_endio(bio, err);
drbd_req_free(req);
dec_ap_bio(mdev);
kfree(b);
return 0;
}
/* helper function for drbd_make_request
* if we can determine just by the mdev (state) that this request will fail,
* return 1
* otherwise return 0
*/
static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
{
/* Unconfigured */
if (mdev->state.conn == C_DISCONNECTING &&
mdev->state.disk == D_DISKLESS)
return 1;
if (mdev->state.role != R_PRIMARY &&
(!allow_oos || is_write)) {
if (__ratelimit(&drbd_ratelimit_state)) {
dev_err(DEV, "Process %s[%u] tried to %s; "
"since we are not in Primary state, "
"we cannot allow this\n",
current->comm, current->pid,
is_write ? "WRITE" : "READ");
}
return 1;
}
/*
* Paranoia: we might have been primary, but sync target, or
* even diskless, then lost the connection.
* This should have been handled (panic? suspend?) somewhere
* else. But maybe it was not, so check again here.
* Caution: as long as we do not have a read/write lock on mdev,
* to serialize state changes, this is racy, since we may lose
* the connection *after* we test for the cstate.
*/
if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
return 1;
}
return 0;
}
int drbd_make_request_26(struct request_queue *q, struct bio *bio)
{
unsigned int s_enr, e_enr;
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
bio_endio(bio, -EPERM);
return 0;
}
/* Reject barrier requests if we know the underlying device does
* not support them.
* XXX: Need to get this info from peer as well some how so we
* XXX: reject if EITHER side/data/metadata area does not support them.
*
* because of those XXX, this is not yet enabled,
* i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit.
*/
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) {
/* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
/*
* what we "blindly" assume:
*/
D_ASSERT(bio->bi_size > 0);
D_ASSERT((bio->bi_size & 0x1ff) == 0);
D_ASSERT(bio->bi_idx == 0);
/* to make some things easier, force alignment of requests within the
* granularity of our hash tables */
s_enr = bio->bi_sector >> HT_SHIFT;
e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
if (likely(s_enr == e_enr)) {
inc_ap_bio(mdev, 1);
return drbd_make_request_common(mdev, bio);
}
/* can this bio be split generically?
* Maybe add our own split-arbitrary-bios function. */
if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) {
/* rather error out here than BUG in bio_split */
dev_err(DEV, "bio would need to, but cannot, be split: "
"(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
bio->bi_vcnt, bio->bi_idx, bio->bi_size,
(unsigned long long)bio->bi_sector);
bio_endio(bio, -EINVAL);
} else {
/* This bio crosses some boundary, so we have to split it. */
struct bio_pair *bp;
/* works for the "do not cross hash slot boundaries" case
* e.g. sector 262269, size 4096
* s_enr = 262269 >> 6 = 4097
* e_enr = (262269+8-1) >> 6 = 4098
* HT_SHIFT = 6
* sps = 64, mask = 63
* first_sectors = 64 - (262269 & 63) = 3
*/
const sector_t sect = bio->bi_sector;
const int sps = 1 << HT_SHIFT; /* sectors per slot */
const int mask = sps - 1;
const sector_t first_sectors = sps - (sect & mask);
bp = bio_split(bio,
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28)
bio_split_pool,
#endif
first_sectors);
/* we need to get a "reference count" (ap_bio_cnt)
* to avoid races with the disconnect/reconnect/suspend code.
* In case we need to split the bio here, we need to get two references
* atomically, otherwise we might deadlock when trying to submit the
* second one! */
inc_ap_bio(mdev, 2);
D_ASSERT(e_enr == s_enr + 1);
drbd_make_request_common(mdev, &bp->bio1);
drbd_make_request_common(mdev, &bp->bio2);
bio_pair_release(bp);
}
return 0;
}
/* This is called by bio_add_page(). With this function we reduce
* the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs
* units (was AL_EXTENTs).
*
* we do the calculation within the lower 32bit of the byte offsets,
* since we don't care for actual offset, but only check whether it
* would cross "activity log extent" boundaries.
*
* As long as the BIO is empty we have to allow at least one bvec,
* regardless of size and offset. so the resulting bio may still
* cross extent boundaries. those are dealt with (bio_split) in
* drbd_make_request_26.
*/
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
unsigned int bio_offset =
(unsigned int)bvm->bi_sector << 9; /* 32 bit */
unsigned int bio_size = bvm->bi_size;
int limit, backing_limit;
limit = DRBD_MAX_SEGMENT_SIZE
- ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size);
if (limit < 0)
limit = 0;
if (bio_size == 0) {
if (limit <= bvec->bv_len)
limit = bvec->bv_len;
} else if (limit && get_ldev(mdev)) {
struct request_queue * const b =
mdev->ldev->backing_bdev->bd_disk->queue;
if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
backing_limit = b->merge_bvec_fn(b, bvm, bvec);
limit = min(limit, backing_limit);
}
put_ldev(mdev);
}
return limit;
}
/*
drbd_req.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
DRBD is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
DRBD is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_REQ_H
#define _DRBD_REQ_H
#include <linux/autoconf.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"
/* The request callbacks will be called in irq context by the IDE drivers,
and in Softirqs/Tasklets/BH context by the SCSI drivers,
and by the receiver and worker in kernel-thread context.
Try to get the locking right :) */
/*
* Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
* associated with IO requests originating from the block layer above us.
*
* There are quite a few things that may happen to a drbd request
* during its lifetime.
*
* It will be created.
* It will be marked with the intention to be
* submitted to local disk and/or
* send via the network.
*
* It has to be placed on the transfer log and other housekeeping lists,
* In case we have a network connection.
*
* It may be identified as a concurrent (write) request
* and be handled accordingly.
*
* It may me handed over to the local disk subsystem.
* It may be completed by the local disk subsystem,
* either sucessfully or with io-error.
* In case it is a READ request, and it failed locally,
* it may be retried remotely.
*
* It may be queued for sending.
* It may be handed over to the network stack,
* which may fail.
* It may be acknowledged by the "peer" according to the wire_protocol in use.
* this may be a negative ack.
* It may receive a faked ack when the network connection is lost and the
* transfer log is cleaned up.
* Sending may be canceled due to network connection loss.
* When it finally has outlived its time,
* corresponding dirty bits in the resync-bitmap may be cleared or set,
* it will be destroyed,
* and completion will be signalled to the originator,
* with or without "success".
*/
enum drbd_req_event {
created,
to_be_send,
to_be_submitted,
/* XXX yes, now I am inconsistent...
* these two are not "events" but "actions"
* oh, well... */
queue_for_net_write,
queue_for_net_read,
send_canceled,
send_failed,
handed_over_to_network,
connection_lost_while_pending,
recv_acked_by_peer,
write_acked_by_peer,
write_acked_by_peer_and_sis, /* and set_in_sync */
conflict_discarded_by_peer,
neg_acked,
barrier_acked, /* in protocol A and B */
data_received, /* (remote read) */
read_completed_with_error,
read_ahead_completed_with_error,
write_completed_with_error,
completed_ok,
nothing, /* for tracing only */
};
/* encoding of request states for now. we don't actually need that many bits.
* we don't need to do atomic bit operations either, since most of the time we
* need to look at the connection state and/or manipulate some lists at the
* same time, so we should hold the request lock anyways.
*/
enum drbd_req_state_bits {
/* 210
* 000: no local possible
* 001: to be submitted
* UNUSED, we could map: 011: submitted, completion still pending
* 110: completed ok
* 010: completed with error
*/
__RQ_LOCAL_PENDING,
__RQ_LOCAL_COMPLETED,
__RQ_LOCAL_OK,
/* 76543
* 00000: no network possible
* 00001: to be send
* 00011: to be send, on worker queue
* 00101: sent, expecting recv_ack (B) or write_ack (C)
* 11101: sent,
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
* 11100: write_acked (C),
* data_received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
* or neg-d-acked (read, any protocol)
* or killed from the transfer log
* during cleanup after connection loss
* request can be freed
* 01000: canceled or send failed...
* request can be freed
*/
/* if "SENT" is not set, yet, this can still fail or be canceled.
* if "SENT" is set already, we still wait for an Ack packet.
* when cleared, the master_bio may be completed.
* in (B,A) the request object may still linger on the transaction log
* until the corresponding barrier ack comes in */
__RQ_NET_PENDING,
/* If it is QUEUED, and it is a WRITE, it is also registered in the
* transfer log. Currently we need this flag to avoid conflicts between
* worker canceling the request and tl_clear_barrier killing it from
* transfer log. We should restructure the code so this conflict does
* no longer occur. */
__RQ_NET_QUEUED,
/* well, actually only "handed over to the network stack".
*
* TODO can potentially be dropped because of the similar meaning
* of RQ_NET_SENT and ~RQ_NET_QUEUED.
* however it is not exactly the same. before we drop it
* we must ensure that we can tell a request with network part
* from a request without, regardless of what happens to it. */
__RQ_NET_SENT,
/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
* basically this means the corresponding P_BARRIER_ACK was received */
__RQ_NET_DONE,
/* whether or not we know (C) or pretend (B,A) that the write
* was successfully written on the peer.
*/
__RQ_NET_OK,
/* peer called drbd_set_in_sync() for this write */
__RQ_NET_SIS,
/* keep this last, its for the RQ_NET_MASK */
__RQ_NET_MAX,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
#define RQ_NET_OK (1UL << __RQ_NET_OK)
#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->ee_hash_s == 0);
return mdev->ee_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
}
/* transfer log (drbd_request objects) */
static inline
struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
BUG_ON(mdev->tl_hash_s == 0);
return mdev->tl_hash +
((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
}
/* application reads (drbd_request objects) */
static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
{
return mdev->app_reads_hash
+ ((unsigned int)(sector) % APP_R_HSIZE);
}
/* when we receive the answer for a read request,
* verify that we actually know about it */
static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
u64 id, sector_t sector)
{
struct hlist_head *slot = ar_hash_slot(mdev, sector);
struct hlist_node *n;
struct drbd_request *req;
hlist_for_each_entry(req, n, slot, colision) {
if ((unsigned long)req == (unsigned long)id) {
D_ASSERT(req->sector == sector);
return req;
}
}
return NULL;
}
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
struct bio *bio;
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
req->rq_state = 0;
req->mdev = mdev;
req->master_bio = bio_src;
req->private_bio = bio;
req->epoch = 0;
req->sector = bio->bi_sector;
req->size = bio->bi_size;
req->start_time = jiffies;
INIT_HLIST_NODE(&req->colision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
bio->bi_private = req;
bio->bi_end_io = drbd_endio_pri;
bio->bi_next = NULL;
}
return req;
}
static inline void drbd_req_free(struct drbd_request *req)
{
mempool_free(req, drbd_request_mempool);
}
static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
{
return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
}
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_size, or similar. But that would be too ugly. */
struct bio_and_error {
struct bio *bio;
int error;
};
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
/* __req_mod possibly frees req, do not touch req after that! */
__req_mod(req, what, &m);
if (m.bio)
complete_master_bio(mdev, &m);
}
/* completion of master bio is outside of spinlock.
* If you need it irqsave, do it your self! */
static inline void req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
spin_lock_irq(&mdev->req_lock);
__req_mod(req, what, &m);
spin_unlock_irq(&mdev->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
}
#endif
/*
drbd.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/drbd.h>
static const char *drbd_conn_s_names[] = {
[C_STANDALONE] = "StandAlone",
[C_DISCONNECTING] = "Disconnecting",
[C_UNCONNECTED] = "Unconnected",
[C_TIMEOUT] = "Timeout",
[C_BROKEN_PIPE] = "BrokenPipe",
[C_NETWORK_FAILURE] = "NetworkFailure",
[C_PROTOCOL_ERROR] = "ProtocolError",
[C_WF_CONNECTION] = "WFConnection",
[C_WF_REPORT_PARAMS] = "WFReportParams",
[C_TEAR_DOWN] = "TearDown",
[C_CONNECTED] = "Connected",
[C_STARTING_SYNC_S] = "StartingSyncS",
[C_STARTING_SYNC_T] = "StartingSyncT",
[C_WF_BITMAP_S] = "WFBitMapS",
[C_WF_BITMAP_T] = "WFBitMapT",
[C_WF_SYNC_UUID] = "WFSyncUUID",
[C_SYNC_SOURCE] = "SyncSource",
[C_SYNC_TARGET] = "SyncTarget",
[C_PAUSED_SYNC_S] = "PausedSyncS",
[C_PAUSED_SYNC_T] = "PausedSyncT",
[C_VERIFY_S] = "VerifyS",
[C_VERIFY_T] = "VerifyT",
};
static const char *drbd_role_s_names[] = {
[R_PRIMARY] = "Primary",
[R_SECONDARY] = "Secondary",
[R_UNKNOWN] = "Unknown"
};
static const char *drbd_disk_s_names[] = {
[D_DISKLESS] = "Diskless",
[D_ATTACHING] = "Attaching",
[D_FAILED] = "Failed",
[D_NEGOTIATING] = "Negotiating",
[D_INCONSISTENT] = "Inconsistent",
[D_OUTDATED] = "Outdated",
[D_UNKNOWN] = "DUnknown",
[D_CONSISTENT] = "Consistent",
[D_UP_TO_DATE] = "UpToDate",
};
static const char *drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
[-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
[-SS_DEVICE_IN_USE] = "Device is held open by someone",
[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
};
const char *drbd_conn_str(enum drbd_conns s)
{
/* enums are unsigned... */
return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s];
}
const char *drbd_role_str(enum drbd_role s)
{
return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
}
const char *drbd_disk_str(enum drbd_disk_state s)
{
return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
}
const char *drbd_set_st_err_str(enum drbd_state_ret_codes err)
{
return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
err > SS_TWO_PRIMARIES ? "TOO_LARGE"
: drbd_state_sw_errors[-err];
}
/*
drbd_tracing.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/drbd.h>
#include <linux/ctype.h>
#include "drbd_int.h"
#include "drbd_tracing.h"
#include <linux/drbd_tag_magic.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg");
MODULE_DESCRIPTION("DRBD tracepoint probes");
MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c");
MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)");
MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)");
unsigned int trace_mask = 0; /* Bitmap of events to trace */
int trace_level; /* Current trace level */
int trace_devs; /* Bitmap of devices to trace */
module_param(trace_mask, uint, 0444);
module_param(trace_level, int, 0644);
module_param(trace_devs, int, 0644);
enum {
TRACE_PACKET = 0x0001,
TRACE_RQ = 0x0002,
TRACE_UUID = 0x0004,
TRACE_RESYNC = 0x0008,
TRACE_EE = 0x0010,
TRACE_UNPLUG = 0x0020,
TRACE_NL = 0x0040,
TRACE_AL_EXT = 0x0080,
TRACE_INT_RQ = 0x0100,
TRACE_MD_IO = 0x0200,
TRACE_EPOCH = 0x0400,
};
/* Buffer printing support
* dbg_print_flags: used for Flags arg to drbd_print_buffer
* - DBGPRINT_BUFFADDR; if set, each line starts with the
* virtual address of the line being output. If clear,
* each line starts with the offset from the beginning
* of the buffer. */
enum dbg_print_flags {
DBGPRINT_BUFFADDR = 0x0001,
};
/* Macro stuff */
static char *nl_packet_name(int packet_type)
{
/* Generate packet type strings */
#define NL_PACKET(name, number, fields) \
[P_ ## name] = # name,
#define NL_INTEGER Argh!
#define NL_BIT Argh!
#define NL_INT64 Argh!
#define NL_STRING Argh!
static char *nl_tag_name[P_nl_after_last_packet] = {
#include "linux/drbd_nl.h"
};
return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ?
nl_tag_name[packet_type] : "*Unknown*";
}
/* /Macro stuff */
static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level)
{
return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs);
}
static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg)
{
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt));
}
static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
{
static char *uuid_str[UI_EXTENDED_SIZE] = {
[UI_CURRENT] = "CURRENT",
[UI_BITMAP] = "BITMAP",
[UI_HISTORY_START] = "HISTORY_START",
[UI_HISTORY_END] = "HISTORY_END",
[UI_SIZE] = "SIZE",
[UI_FLAGS] = "FLAGS",
};
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
if (index >= UI_EXTENDED_SIZE) {
dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
return;
}
dev_info(DEV, " uuid[%s] now %016llX\n",
uuid_str[index],
(unsigned long long)mdev->ldev->md.uuid[index]);
}
static void probe_drbd_md_io(struct drbd_conf *mdev, int rw,
struct drbd_backing_dev *bdev)
{
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
dev_info(DEV, " %s metadata superblock now\n",
rw == READ ? "Reading" : "Writing");
}
static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg)
{
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n",
msg, (unsigned long long)e->sector, e->size, e);
}
static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch,
enum epoch_event ev)
{
static char *epoch_event_str[] = {
[EV_PUT] = "put",
[EV_GOT_BARRIER_NR] = "got_barrier_nr",
[EV_BARRIER_DONE] = "barrier_done",
[EV_BECAME_LAST] = "became_last",
[EV_TRACE_FLUSH] = "issuing_flush",
[EV_TRACE_ADD_BARRIER] = "added_barrier",
[EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch",
};
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
ev &= ~EV_CLEANUP;
switch (ev) {
case EV_TRACE_ALLOC:
dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs);
break;
case EV_TRACE_FREE:
dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n",
epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
mdev->epochs);
break;
default:
dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n",
epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size),
atomic_read(&epoch->active),
test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-',
test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-',
test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-',
test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-',
epoch_event_str[ev]);
}
}
static void probe_drbd_netlink(void *data, int is_req)
{
struct cn_msg *msg = data;
if (is_req) {
struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data;
printk(KERN_INFO "drbd%d: "
"Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n",
nlp->drbd_minor,
nl_packet_name(nlp->packet_type),
nlp->packet_type,
msg->seq, msg->ack, msg->len);
} else {
struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data;
printk(KERN_INFO "drbd%d: "
"Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n",
nlp->minor,
nlp->packet_type == P_nl_after_last_packet ?
"Empty-Reply" : nl_packet_name(nlp->packet_type),
nlp->packet_type,
msg->seq, msg->ack, msg->len);
}
}
static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg)
{
unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n",
msg, (unsigned long long) sector, enr,
(int)BM_SECT_TO_EXT(sector));
}
/**
* drbd_print_buffer() - Hexdump arbitrary binary data into a buffer
* @prefix: String is output at the beginning of each line output.
* @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each
* line starts with the virtual address of the line being
* output. If clear, each line starts with the offset from the
* beginning of the buffer.
* @size: Indicates the size of each entry in the buffer. Supported
* values are sizeof(char), sizeof(short) and sizeof(int)
* @buffer: Start address of buffer
* @buffer_va: Virtual address of start of buffer (normally the same
* as Buffer, but having it separate allows it to hold
* file address for example)
* @length: length of buffer
*/
static void drbd_print_buffer(const char *prefix, unsigned int flags, int size,
const void *buffer, const void *buffer_va,
unsigned int length)
#define LINE_SIZE 16
#define LINE_ENTRIES (int)(LINE_SIZE/size)
{
const unsigned char *pstart;
const unsigned char *pstart_va;
const unsigned char *pend;
char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8];
char *pbytes = bytes_str, *pascii = ascii_str;
int offset = 0;
long sizemask;
int field_width;
int index;
const unsigned char *pend_str;
const unsigned char *p;
int count;
/* verify size parameter */
if (size != sizeof(char) &&
size != sizeof(short) &&
size != sizeof(int)) {
printk(KERN_DEBUG "drbd_print_buffer: "
"ERROR invalid size %d\n", size);
return;
}
sizemask = size-1;
field_width = size*2;
/* Adjust start/end to be on appropriate boundary for size */
buffer = (const char *)((long)buffer & ~sizemask);
pend = (const unsigned char *)
(((long)buffer + length + sizemask) & ~sizemask);
if (flags & DBGPRINT_BUFFADDR) {
/* Move start back to nearest multiple of line size,
* if printing address. This results in nicely formatted output
* with addresses being on line size (16) byte boundaries */
pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1));
} else {
pstart = (const unsigned char *)buffer;
}
/* Set value of start VA to print if addresses asked for */
pstart_va = (const unsigned char *)buffer_va
- ((const unsigned char *)buffer-pstart);
/* Calculate end position to nicely align right hand side */
pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1));
/* Init strings */
*pbytes = *pascii = '\0';
/* Start at beginning of first line */
p = pstart;
count = 0;
while (p < pend_str) {
if (p < (const unsigned char *)buffer || p >= pend) {
/* Before start of buffer or after end- print spaces */
pbytes += sprintf(pbytes, "%*c ", field_width, ' ');
pascii += sprintf(pascii, "%*c", size, ' ');
p += size;
} else {
/* Add hex and ascii to strings */
int val;
switch (size) {
default:
case 1:
val = *(unsigned char *)p;
break;
case 2:
val = *(unsigned short *)p;
break;
case 4:
val = *(unsigned int *)p;
break;
}
pbytes += sprintf(pbytes, "%0*x ", field_width, val);
for (index = size; index; index--) {
*pascii++ = isprint(*p) ? *p : '.';
p++;
}
}
count++;
if (count == LINE_ENTRIES || p >= pend_str) {
/* Null terminate and print record */
*pascii = '\0';
printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n",
prefix,
(flags & DBGPRINT_BUFFADDR)
? (long)pstart_va:(long)offset,
LINE_ENTRIES*(field_width+1), bytes_str,
LINE_SIZE, ascii_str);
/* Move onto next line */
pstart_va += (p-pstart);
pstart = p;
count = 0;
offset += LINE_SIZE;
/* Re-init strings */
pbytes = bytes_str;
pascii = ascii_str;
*pbytes = *pascii = '\0';
}
}
}
static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args)
{
char str[256];
if (!is_mdev_trace(mdev, level))
return;
if (vsnprintf(str, 256, fmt, args) >= 256)
str[255] = 0;
printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)),
dev_name(disk_to_dev(mdev->vdisk)), str);
}
static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
struct drbd_request *r)
{
#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD)
#define SECTOR_FORMAT "%Lx"
#else
#define SECTOR_FORMAT "%lx"
#endif
#define SECTOR_SHIFT 9
unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT);
char *faddr = (char *)(lowaddr);
char rb[sizeof(void *)*2+6] = { 0, };
struct bio_vec *bvec;
int segno;
const int rw = bio->bi_rw;
const int biorw = (rw & (RW_MASK|RWA_MASK));
const int biobarrier = (rw & (1<<BIO_RW_BARRIER));
const int biosync = (rw & ((1<<BIO_RW_UNPLUG) | (1<<BIO_RW_SYNCIO)));
if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS))
return;
if (r)
sprintf(rb, "Req:%p ", r);
dev_info(DEV, "%s %s:%s%s%s Bio:%p %s- %soffset " SECTOR_FORMAT ", size %x\n",
complete ? "<<<" : ">>>",
pfx,
biorw == WRITE ? "Write" : "Read",
biobarrier ? " : B" : "",
biosync ? " : S" : "",
bio,
rb,
complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "",
bio->bi_sector << SECTOR_SHIFT,
bio->bi_size);
if (trace_level >= TRACE_LVL_METRICS &&
((biorw == WRITE) ^ complete)) {
printk(KERN_DEBUG " ind page offset length\n");
__bio_for_each_segment(bvec, bio, segno, 0) {
printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno,
bvec->bv_page, bvec->bv_offset, bvec->bv_len);
if (trace_level >= TRACE_LVL_ALL) {
char *bvec_buf;
unsigned long flags;
bvec_buf = bvec_kmap_irq(bvec, &flags);
drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1,
bvec_buf,
faddr,
(bvec->bv_len <= 0x80)
? bvec->bv_len : 0x80);
bvec_kunmap_irq(bvec_buf, &flags);
if (bvec->bv_len > 0x40)
printk(KERN_DEBUG " ....\n");
faddr += bvec->bv_len;
}
}
}
}
static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg)
{
static const char *rq_event_names[] = {
[created] = "created",
[to_be_send] = "to_be_send",
[to_be_submitted] = "to_be_submitted",
[queue_for_net_write] = "queue_for_net_write",
[queue_for_net_read] = "queue_for_net_read",
[send_canceled] = "send_canceled",
[send_failed] = "send_failed",
[handed_over_to_network] = "handed_over_to_network",
[connection_lost_while_pending] =
"connection_lost_while_pending",
[recv_acked_by_peer] = "recv_acked_by_peer",
[write_acked_by_peer] = "write_acked_by_peer",
[neg_acked] = "neg_acked",
[conflict_discarded_by_peer] = "conflict_discarded_by_peer",
[barrier_acked] = "barrier_acked",
[data_received] = "data_received",
[read_completed_with_error] = "read_completed_with_error",
[read_ahead_completed_with_error] = "reada_completed_with_error",
[write_completed_with_error] = "write_completed_with_error",
[completed_ok] = "completed_ok",
};
struct drbd_conf *mdev = req->mdev;
const int rw = (req->master_bio == NULL ||
bio_data_dir(req->master_bio) == WRITE) ?
'W' : 'R';
const unsigned long s = req->rq_state;
if (what != nothing) {
dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]);
} else {
dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n",
msg, req, rw,
s & RQ_LOCAL_PENDING ? 'p' : '-',
s & RQ_LOCAL_COMPLETED ? 'c' : '-',
s & RQ_LOCAL_OK ? 'o' : '-',
s & RQ_NET_PENDING ? 'p' : '-',
s & RQ_NET_QUEUED ? 'q' : '-',
s & RQ_NET_SENT ? 's' : '-',
s & RQ_NET_DONE ? 'd' : '-',
s & RQ_NET_OK ? 'o' : '-',
req->epoch,
(unsigned long long)req->sector,
req->size,
drbd_conn_str(mdev->state.conn));
}
}
#define drbd_peer_str drbd_role_str
#define drbd_pdsk_str drbd_disk_str
#define PSM(A) \
do { \
if (mask.A) { \
int i = snprintf(p, len, " " #A "( %s )", \
drbd_##A##_str(val.A)); \
if (i >= len) \
return op; \
p += i; \
len -= i; \
} \
} while (0)
static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val)
{
char *op = p;
*p = '\0';
PSM(role);
PSM(peer);
PSM(conn);
PSM(disk);
PSM(pdsk);
return op;
}
#define INFOP(fmt, args...) \
do { \
if (trace_level >= TRACE_LVL_ALL) { \
dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \
file, line, current->comm, current->pid, \
sockname, recv ? "<<<" : ">>>" , \
## args); \
} else { \
dev_info(DEV, "%s %s " fmt, sockname, \
recv ? "<<<" : ">>>" , \
## args); \
} \
} while (0)
static char *_dump_block_id(u64 block_id, char *buff)
{
if (is_syncer_block_id(block_id))
strcpy(buff, "SyncerId");
else
sprintf(buff, "%llx", (unsigned long long)block_id);
return buff;
}
static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock,
int recv, union p_polymorph *p, char *file, int line)
{
char *sockname = sock == mdev->meta.socket ? "meta" : "data";
int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command);
char tmp[300];
union drbd_state m, v;
switch (cmd) {
case P_HAND_SHAKE:
INFOP("%s (protocol %u-%u)\n", cmdname(cmd),
be32_to_cpu(p->handshake.protocol_min),
be32_to_cpu(p->handshake.protocol_max));
break;
case P_BITMAP: /* don't report this */
case P_COMPRESSED_BITMAP: /* don't report this */
break;
case P_DATA:
INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd),
(unsigned long long)be64_to_cpu(p->data.sector),
_dump_block_id(p->data.block_id, tmp),
be32_to_cpu(p->data.seq_num),
be32_to_cpu(p->data.dp_flags)
);
break;
case P_DATA_REPLY:
case P_RS_DATA_REPLY:
INFOP("%s (sector %llus, id %s)\n", cmdname(cmd),
(unsigned long long)be64_to_cpu(p->data.sector),
_dump_block_id(p->data.block_id, tmp)
);
break;
case P_RECV_ACK:
case P_WRITE_ACK:
case P_RS_WRITE_ACK:
case P_DISCARD_ACK:
case P_NEG_ACK:
case P_NEG_RS_DREPLY:
INFOP("%s (sector %llus, size %u, id %s, seq %u)\n",
cmdname(cmd),
(long long)be64_to_cpu(p->block_ack.sector),
be32_to_cpu(p->block_ack.blksize),
_dump_block_id(p->block_ack.block_id, tmp),
be32_to_cpu(p->block_ack.seq_num)
);
break;
case P_DATA_REQUEST:
case P_RS_DATA_REQUEST:
INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd),
(long long)be64_to_cpu(p->block_req.sector),
be32_to_cpu(p->block_req.blksize),
_dump_block_id(p->block_req.block_id, tmp)
);
break;
case P_BARRIER:
case P_BARRIER_ACK:
INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier);
break;
case P_SYNC_PARAM:
case P_SYNC_PARAM89:
INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n",
cmdname(cmd), be32_to_cpu(p->rs_param_89.rate),
p->rs_param_89.verify_alg, p->rs_param_89.csums_alg);
break;
case P_UUIDS:
INFOP("%s Curr:%016llX, Bitmap:%016llX, "
"HisSt:%016llX, HisEnd:%016llX\n",
cmdname(cmd),
(unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]),
(unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]),
(unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]),
(unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END]));
break;
case P_SIZES:
INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, "
"max bio %x, q order %x)\n",
cmdname(cmd),
(long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)),
(long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)),
(long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)),
be32_to_cpu(p->sizes.max_segment_size),
be32_to_cpu(p->sizes.queue_order_type));
break;
case P_STATE:
v.i = be32_to_cpu(p->state.state);
m.i = 0xffffffff;
dump_st(tmp, sizeof(tmp), m, v);
INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp);
break;
case P_STATE_CHG_REQ:
m.i = be32_to_cpu(p->req_state.mask);
v.i = be32_to_cpu(p->req_state.val);
dump_st(tmp, sizeof(tmp), m, v);
INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp);
break;
case P_STATE_CHG_REPLY:
INFOP("%s (ret %x)\n", cmdname(cmd),
be32_to_cpu(p->req_state_reply.retcode));
break;
case P_PING:
case P_PING_ACK:
/*
* Dont trace pings at summary level
*/
if (trace_level < TRACE_LVL_ALL)
break;
/* fall through... */
default:
INFOP("%s (%u)\n", cmdname(cmd), cmd);
break;
}
}
static int __init drbd_trace_init(void)
{
int ret;
if (trace_mask & TRACE_UNPLUG) {
ret = register_trace_drbd_unplug(probe_drbd_unplug);
WARN_ON(ret);
}
if (trace_mask & TRACE_UUID) {
ret = register_trace_drbd_uuid(probe_drbd_uuid);
WARN_ON(ret);
}
if (trace_mask & TRACE_EE) {
ret = register_trace_drbd_ee(probe_drbd_ee);
WARN_ON(ret);
}
if (trace_mask & TRACE_PACKET) {
ret = register_trace_drbd_packet(probe_drbd_packet);
WARN_ON(ret);
}
if (trace_mask & TRACE_MD_IO) {
ret = register_trace_drbd_md_io(probe_drbd_md_io);
WARN_ON(ret);
}
if (trace_mask & TRACE_EPOCH) {
ret = register_trace_drbd_epoch(probe_drbd_epoch);
WARN_ON(ret);
}
if (trace_mask & TRACE_NL) {
ret = register_trace_drbd_netlink(probe_drbd_netlink);
WARN_ON(ret);
}
if (trace_mask & TRACE_AL_EXT) {
ret = register_trace_drbd_actlog(probe_drbd_actlog);
WARN_ON(ret);
}
if (trace_mask & TRACE_RQ) {
ret = register_trace_drbd_bio(probe_drbd_bio);
WARN_ON(ret);
}
if (trace_mask & TRACE_INT_RQ) {
ret = register_trace_drbd_req(probe_drbd_req);
WARN_ON(ret);
}
if (trace_mask & TRACE_RESYNC) {
ret = register_trace__drbd_resync(probe_drbd_resync);
WARN_ON(ret);
}
return 0;
}
module_init(drbd_trace_init);
static void __exit drbd_trace_exit(void)
{
if (trace_mask & TRACE_UNPLUG)
unregister_trace_drbd_unplug(probe_drbd_unplug);
if (trace_mask & TRACE_UUID)
unregister_trace_drbd_uuid(probe_drbd_uuid);
if (trace_mask & TRACE_EE)
unregister_trace_drbd_ee(probe_drbd_ee);
if (trace_mask & TRACE_PACKET)
unregister_trace_drbd_packet(probe_drbd_packet);
if (trace_mask & TRACE_MD_IO)
unregister_trace_drbd_md_io(probe_drbd_md_io);
if (trace_mask & TRACE_EPOCH)
unregister_trace_drbd_epoch(probe_drbd_epoch);
if (trace_mask & TRACE_NL)
unregister_trace_drbd_netlink(probe_drbd_netlink);
if (trace_mask & TRACE_AL_EXT)
unregister_trace_drbd_actlog(probe_drbd_actlog);
if (trace_mask & TRACE_RQ)
unregister_trace_drbd_bio(probe_drbd_bio);
if (trace_mask & TRACE_INT_RQ)
unregister_trace_drbd_req(probe_drbd_req);
if (trace_mask & TRACE_RESYNC)
unregister_trace__drbd_resync(probe_drbd_resync);
tracepoint_synchronize_unregister();
}
module_exit(drbd_trace_exit);
/*
drbd_tracing.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef DRBD_TRACING_H
#define DRBD_TRACING_H
#include <linux/tracepoint.h>
#include "drbd_int.h"
#include "drbd_req.h"
enum {
TRACE_LVL_ALWAYS = 0,
TRACE_LVL_SUMMARY,
TRACE_LVL_METRICS,
TRACE_LVL_ALL,
TRACE_LVL_MAX
};
DECLARE_TRACE(drbd_unplug,
TP_PROTO(struct drbd_conf *mdev, char* msg),
TP_ARGS(mdev, msg));
DECLARE_TRACE(drbd_uuid,
TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index),
TP_ARGS(mdev, index));
DECLARE_TRACE(drbd_ee,
TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg),
TP_ARGS(mdev, e, msg));
DECLARE_TRACE(drbd_md_io,
TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev),
TP_ARGS(mdev, rw, bdev));
DECLARE_TRACE(drbd_epoch,
TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev),
TP_ARGS(mdev, epoch, ev));
DECLARE_TRACE(drbd_netlink,
TP_PROTO(void *data, int is_req),
TP_ARGS(data, is_req));
DECLARE_TRACE(drbd_actlog,
TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg),
TP_ARGS(mdev, sector, msg));
DECLARE_TRACE(drbd_bio,
TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete,
struct drbd_request *r),
TP_ARGS(mdev, pfx, bio, complete, r));
DECLARE_TRACE(drbd_req,
TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg),
TP_ARGS(req, what, msg));
DECLARE_TRACE(drbd_packet,
TP_PROTO(struct drbd_conf *mdev, struct socket *sock,
int recv, union p_polymorph *p, char *file, int line),
TP_ARGS(mdev, sock, recv, p, file, line));
DECLARE_TRACE(_drbd_resync,
TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args),
TP_ARGS(mdev, level, fmt, args));
#endif
/*
-*- linux-c -*-
drbd_receiver.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef _DRBD_VLI_H
#define _DRBD_VLI_H
/*
* At a granularity of 4KiB storage represented per bit,
* and stroage sizes of several TiB,
* and possibly small-bandwidth replication,
* the bitmap transfer time can take much too long,
* if transmitted in plain text.
*
* We try to reduce the transfered bitmap information
* by encoding runlengths of bit polarity.
*
* We never actually need to encode a "zero" (runlengths are positive).
* But then we have to store the value of the first bit.
* The first bit of information thus shall encode if the first runlength
* gives the number of set or unset bits.
*
* We assume that large areas are either completely set or unset,
* which gives good compression with any runlength method,
* even when encoding the runlength as fixed size 32bit/64bit integers.
*
* Still, there may be areas where the polarity flips every few bits,
* and encoding the runlength sequence of those areas with fix size
* integers would be much worse than plaintext.
*
* We want to encode small runlength values with minimum code length,
* while still being able to encode a Huge run of all zeros.
*
* Thus we need a Variable Length Integer encoding, VLI.
*
* For some cases, we produce more code bits than plaintext input.
* We need to send incompressible chunks as plaintext, skip over them
* and then see if the next chunk compresses better.
*
* We don't care too much about "excellent" compression ratio for large
* runlengths (all set/all clear): whether we achieve a factor of 100
* or 1000 is not that much of an issue.
* We do not want to waste too much on short runlengths in the "noisy"
* parts of the bitmap, though.
*
* There are endless variants of VLI, we experimented with:
* * simple byte-based
* * various bit based with different code word length.
*
* To avoid yet an other configuration parameter (choice of bitmap compression
* algorithm) which was difficult to explain and tune, we just chose the one
* variant that turned out best in all test cases.
* Based on real world usage patterns, with device sizes ranging from a few GiB
* to several TiB, file server/mailserver/webserver/mysql/postgress,
* mostly idle to really busy, the all time winner (though sometimes only
* marginally better) is:
*/
/*
* encoding is "visualised" as
* __little endian__ bitstream, least significant bit first (left most)
*
* this particular encoding is chosen so that the prefix code
* starts as unary encoding the level, then modified so that
* 10 levels can be described in 8bit, with minimal overhead
* for the smaller levels.
*
* Number of data bits follow fibonacci sequence, with the exception of the
* last level (+1 data bit, so it makes 64bit total). The only worse code when
* encoding bit polarity runlength is 1 plain bits => 2 code bits.
prefix data bits max val Nº data bits
0 x 0x2 1
10 x 0x4 1
110 xx 0x8 2
1110 xxx 0x10 3
11110 xxx xx 0x30 5
111110 xx xxxxxx 0x130 8
11111100 xxxxxxxx xxxxx 0x2130 13
11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
* maximum encodable value: 0x100000400202130 == 2**56 + some */
/* compression "table":
transmitted x 0.29
as plaintext x ........................
x ........................
x ........................
x 0.59 0.21........................
x ........................................................
x .. c ...................................................
x 0.44.. o ...................................................
x .......... d ...................................................
x .......... e ...................................................
X............. ...................................................
x.............. b ...................................................
2.0x............... i ...................................................
#X................ t ...................................................
#................. s ........................... plain bits ..........
-+-----------------------------------------------------------------------
1 16 32 64
*/
/* LEVEL: (total bits, prefix bits, prefix value),
* sorted ascending by number of total bits.
* The rest of the code table is calculated at compiletime from this. */
/* fibonacci data 1, 1, ... */
#define VLI_L_1_1() do { \
LEVEL( 2, 1, 0x00); \
LEVEL( 3, 2, 0x01); \
LEVEL( 5, 3, 0x03); \
LEVEL( 7, 4, 0x07); \
LEVEL(10, 5, 0x0f); \
LEVEL(14, 6, 0x1f); \
LEVEL(21, 8, 0x3f); \
LEVEL(29, 8, 0x7f); \
LEVEL(42, 8, 0xbf); \
LEVEL(64, 8, 0xff); \
} while (0)
/* finds a suitable level to decode the least significant part of in.
* returns number of bits consumed.
*
* BUG() for bad input, as that would mean a buggy code table. */
static inline int vli_decode_bits(u64 *out, const u64 in)
{
u64 adj = 1;
#define LEVEL(t,b,v) \
do { \
if ((in & ((1 << b) -1)) == v) { \
*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
return t; \
} \
adj += 1ULL << (t - b); \
} while (0)
VLI_L_1_1();
/* NOT REACHED, if VLI_LEVELS code table is defined properly */
BUG();
#undef LEVEL
}
/* return number of code bits needed,
* or negative error number */
static inline int __vli_encode_bits(u64 *out, const u64 in)
{
u64 max = 0;
u64 adj = 1;
if (in == 0)
return -EINVAL;
#define LEVEL(t,b,v) do { \
max += 1ULL << (t - b); \
if (in <= max) { \
if (out) \
*out = ((in - adj) << b) | v; \
return t; \
} \
adj = max + 1; \
} while (0)
VLI_L_1_1();
return -EOVERFLOW;
#undef LEVEL
}
#undef VLI_L_1_1
/* code from here down is independend of actually used bit code */
/*
* Code length is determined by some unique (e.g. unary) prefix.
* This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
* not a byte stream.
*/
/* for the bitstream, we need a cursor */
struct bitstream_cursor {
/* the current byte */
u8 *b;
/* the current bit within *b, nomalized: 0..7 */
unsigned int bit;
};
/* initialize cursor to point to first bit of stream */
static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
{
cur->b = s;
cur->bit = 0;
}
/* advance cursor by that many bits; maximum expected input value: 64,
* but depending on VLI implementation, it may be more. */
static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
{
bits += cur->bit;
cur->b = cur->b + (bits >> 3);
cur->bit = bits & 7;
}
/* the bitstream itself knows its length */
struct bitstream {
struct bitstream_cursor cur;
unsigned char *buf;
size_t buf_len; /* in bytes */
/* for input stream:
* number of trailing 0 bits for padding
* total number of valid bits in stream: buf_len * 8 - pad_bits */
unsigned int pad_bits;
};
static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
{
bs->buf = s;
bs->buf_len = len;
bs->pad_bits = pad_bits;
bitstream_cursor_reset(&bs->cur, bs->buf);
}
static inline void bitstream_rewind(struct bitstream *bs)
{
bitstream_cursor_reset(&bs->cur, bs->buf);
memset(bs->buf, 0, bs->buf_len);
}
/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
* Ignores "pad_bits".
* Returns zero if bits == 0 (nothing to do).
* Returns number of bits used if successful.
*
* If there is not enough room left in bitstream,
* leaves bitstream unchanged and returns -ENOBUFS.
*/
static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
{
unsigned char *b = bs->cur.b;
unsigned int tmp;
if (bits == 0)
return 0;
if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
return -ENOBUFS;
/* paranoia: strip off hi bits; they should not be set anyways. */
if (bits < 64)
val &= ~0ULL >> (64 - bits);
*b++ |= (val & 0xff) << bs->cur.bit;
for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
*b++ |= (val >> tmp) & 0xff;
bitstream_cursor_advance(&bs->cur, bits);
return bits;
}
/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
*
* If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
*
* If there are less than the requested number of valid bits left in the
* bitstream, still fetches all available bits.
*
* Returns number of actually fetched bits.
*/
static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
{
u64 val;
unsigned int n;
if (bits > 64)
return -EINVAL;
if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
- bs->cur.bit - bs->pad_bits;
if (bits == 0) {
*out = 0;
return 0;
}
/* get the high bits */
val = 0;
n = (bs->cur.bit + bits + 7) >> 3;
/* n may be at most 9, if cur.bit + bits > 64 */
/* which means this copies at most 8 byte */
if (n) {
memcpy(&val, bs->cur.b+1, n - 1);
val = le64_to_cpu(val) << (8 - bs->cur.bit);
}
/* we still need the low bits */
val |= bs->cur.b[0] >> bs->cur.bit;
/* and mask out bits we don't want */
val &= ~0ULL >> (64 - bits);
bitstream_cursor_advance(&bs->cur, bits);
*out = val;
return bits;
}
/* encodes @in as vli into @bs;
* return values
* > 0: number of bits successfully stored in bitstream
* -ENOBUFS @bs is full
* -EINVAL input zero (invalid)
* -EOVERFLOW input too large for this vli code (invalid)
*/
static inline int vli_encode_bits(struct bitstream *bs, u64 in)
{
u64 code = code;
int bits = __vli_encode_bits(&code, in);
if (bits <= 0)
return bits;
return bitstream_put_bits(bs, code, bits);
}
#endif
/*
drbd_worker.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/autoconf.h>
#include <linux/module.h>
#include <linux/version.h>
#include <linux/drbd.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/scatterlist.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_tracing.h"
#define SLEEP_TIME (HZ/10)
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
/* defined here:
drbd_md_io_complete
drbd_endio_write_sec
drbd_endio_read_sec
drbd_endio_pri
* more endio handlers:
atodb_endio in drbd_actlog.c
drbd_bm_async_io_complete in drbd_bitmap.c
* For all these callbacks, note the following:
* The callbacks will be called in irq context by the IDE drivers,
* and in Softirqs/Tasklets/BH context by the SCSI drivers.
* Try to get the locking right :)
*
*/
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
to evaluate the sync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
/* used for synchronous meta data and bitmap IO
* submitted by drbd_md_sync_page_io()
*/
void drbd_md_io_complete(struct bio *bio, int error)
{
struct drbd_md_io *md_io;
md_io = (struct drbd_md_io *)bio->bi_private;
md_io->error = error;
trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL);
complete(&md_io->event);
}
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
{
unsigned long flags = 0;
struct drbd_epoch_entry *e = NULL;
struct drbd_conf *mdev;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
e = bio->bi_private;
mdev = e->mdev;
if (error)
dev_warn(DEV, "read: error=%d s=%llus\n", error,
(unsigned long long)e->sector);
if (!error && !uptodate) {
dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
(unsigned long long)e->sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
error = -EIO;
}
D_ASSERT(e->block_id != ID_VACANT);
trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
spin_lock_irqsave(&mdev->req_lock, flags);
mdev->read_cnt += e->size >> 9;
list_del(&e->w.list);
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
spin_unlock_irqrestore(&mdev->req_lock, flags);
drbd_chk_io_error(mdev, error, FALSE);
drbd_queue_work(&mdev->data.work, &e->w);
put_ldev(mdev);
trace_drbd_ee(mdev, e, "read completed");
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
{
unsigned long flags = 0;
struct drbd_epoch_entry *e = NULL;
struct drbd_conf *mdev;
sector_t e_sector;
int do_wake;
int is_syncer_req;
int do_al_complete_io;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
e = bio->bi_private;
mdev = e->mdev;
if (error)
dev_warn(DEV, "write: error=%d s=%llus\n", error,
(unsigned long long)e->sector);
if (!error && !uptodate) {
dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
(unsigned long long)e->sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
error = -EIO;
}
/* error == -ENOTSUPP would be a better test,
* alas it is not reliable */
if (error && is_barrier && e->flags & EE_IS_BARRIER) {
drbd_bump_write_ordering(mdev, WO_bdev_flush);
spin_lock_irqsave(&mdev->req_lock, flags);
list_del(&e->w.list);
e->w.cb = w_e_reissue;
/* put_ldev actually happens below, once we come here again. */
__release(local);
spin_unlock_irqrestore(&mdev->req_lock, flags);
drbd_queue_work(&mdev->data.work, &e->w);
return;
}
D_ASSERT(e->block_id != ID_VACANT);
trace_drbd_bio(mdev, "Sec", bio, 1, NULL);
spin_lock_irqsave(&mdev->req_lock, flags);
mdev->writ_cnt += e->size >> 9;
is_syncer_req = is_syncer_block_id(e->block_id);
/* after we moved e to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
e_sector = e->sector;
do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
list_del(&e->w.list); /* has been on active_ee or sync_ee */
list_add_tail(&e->w.list, &mdev->done_ee);
trace_drbd_ee(mdev, e, "write completed");
/* No hlist_del_init(&e->colision) here, we did not send the Ack yet,
* neither did we wake possibly waiting conflicting requests.
* done from "drbd_process_done_ee" within the appropriate w.cb
* (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
do_wake = is_syncer_req
? list_empty(&mdev->sync_ee)
: list_empty(&mdev->active_ee);
if (error)
__drbd_chk_io_error(mdev, FALSE);
spin_unlock_irqrestore(&mdev->req_lock, flags);
if (is_syncer_req)
drbd_rs_complete_io(mdev, e_sector);
if (do_wake)
wake_up(&mdev->ee_wait);
if (do_al_complete_io)
drbd_al_complete_io(mdev, e_sector);
wake_asender(mdev);
put_ldev(mdev);
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
void drbd_endio_pri(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
if (error)
dev_warn(DEV, "p %s: error=%d\n",
bio_data_dir(bio) == WRITE ? "write" : "read", error);
if (!error && !uptodate) {
dev_warn(DEV, "p %s: setting error to -EIO\n",
bio_data_dir(bio) == WRITE ? "write" : "read");
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
error = -EIO;
}
trace_drbd_bio(mdev, "Pri", bio, 1, NULL);
/* to avoid recursion in __req_mod */
if (unlikely(error)) {
what = (bio_data_dir(bio) == WRITE)
? write_completed_with_error
: (bio_rw(bio) == READA)
? read_completed_with_error
: read_ahead_completed_with_error;
} else
what = completed_ok;
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
spin_lock_irqsave(&mdev->req_lock, flags);
__req_mod(req, what, &m);
spin_unlock_irqrestore(&mdev->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
}
int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
/* NOTE: mdev->ldev can be NULL by the time we get here! */
/* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */
/* the only way this callback is scheduled is from _req_may_be_done,
* when it is done and had a local write error, see comments there */
drbd_req_free(req);
return TRUE;
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
/* We should not detach for read io-error,
* but try to WRITE the P_DATA_REPLY to the failed location,
* to give the disk the chance to relocate that block */
spin_lock_irq(&mdev->req_lock);
if (cancel ||
mdev->state.conn < C_CONNECTED ||
mdev->state.pdsk <= D_INCONSISTENT) {
_req_mod(req, send_canceled);
spin_unlock_irq(&mdev->req_lock);
dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n");
return 1;
}
spin_unlock_irq(&mdev->req_lock);
return w_send_read_req(mdev, w, 0);
}
int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
ERR_IF(cancel) return 1;
dev_err(DEV, "resync inactive, but callback triggered??\n");
return 1; /* Simply ignore this! */
}
void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
struct bio_vec *bvec;
int i;
desc.tfm = tfm;
desc.flags = 0;
sg_init_table(&sg, 1);
crypto_hash_init(&desc);
__bio_for_each_segment(bvec, bio, i, 0) {
sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
crypto_hash_update(&desc, &sg, sg.length);
}
crypto_hash_final(&desc, digest);
}
static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int digest_size;
void *digest;
int ok;
D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
return 1;
}
if (likely(drbd_bio_uptodate(e->private_bio))) {
digest_size = crypto_hash_digestsize(mdev->csums_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev,
e->sector,
e->size,
digest,
digest_size,
P_CSUM_RS_REQUEST);
kfree(digest);
} else {
dev_err(DEV, "kmalloc() of digest failed.\n");
ok = 0;
}
} else
ok = 1;
drbd_free_ee(mdev, e);
if (unlikely(!ok))
dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
return ok;
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
struct drbd_epoch_entry *e;
if (!get_ldev(mdev))
return 0;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e) {
put_ldev(mdev);
return 2;
}
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
e->private_bio->bi_end_io = drbd_endio_read_sec;
e->private_bio->bi_rw = READ;
e->w.cb = w_e_send_csum;
mdev->read_cnt += size >> 9;
drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
return 1;
}
void resync_timer_fn(unsigned long data)
{
unsigned long flags;
struct drbd_conf *mdev = (struct drbd_conf *) data;
int queue;
spin_lock_irqsave(&mdev->req_lock, flags);
if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
queue = 1;
if (mdev->state.conn == C_VERIFY_S)
mdev->resync_work.cb = w_make_ov_request;
else
mdev->resync_work.cb = w_make_resync_request;
} else {
queue = 0;
mdev->resync_work.cb = w_resync_inactive;
}
spin_unlock_irqrestore(&mdev->req_lock, flags);
/* harmless race: list_empty outside data.work.q_lock */
if (list_empty(&mdev->resync_work.list) && queue)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
unsigned long bit;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size = queue_max_segment_size(mdev->rq_queue);
int number, i, size, pe, mx;
int align, queued, sndbuf;
if (unlikely(cancel))
return 1;
if (unlikely(mdev->state.conn < C_CONNECTED)) {
dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected");
return 0;
}
if (mdev->state.conn != C_SYNC_TARGET)
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
to continue resync with a broken disk makes no sense at
all */
dev_err(DEV, "Disk broke down during resync!\n");
mdev->resync_work.cb = w_resync_inactive;
return 1;
}
number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
pe = atomic_read(&mdev->rs_pending_cnt);
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req);
else
mx = 1;
mutex_unlock(&mdev->data.mutex);
/* For resync rates >160MB/sec, allow more pending RS requests */
if (number > mx)
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
if ((pe + number) > mx) {
number = mx - pe;
}
for (i = 0; i < number; i++) {
/* Stop generating RS requests, when half of the send buffer is filled */
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket) {
queued = mdev->data.socket->sk->sk_wmem_queued;
sndbuf = mdev->data.socket->sk->sk_sndbuf;
} else {
queued = 1;
sndbuf = 0;
}
mutex_unlock(&mdev->data.mutex);
if (queued > sndbuf / 2)
goto requeue;
next_sector:
size = BM_BLOCK_SIZE;
bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo);
if (bit == -1UL) {
mdev->bm_resync_fo = drbd_bm_bits(mdev);
mdev->resync_work.cb = w_resync_inactive;
put_ldev(mdev);
return 1;
}
sector = BM_BIT_TO_SECT(bit);
if (drbd_try_rs_begin_io(mdev, sector)) {
mdev->bm_resync_fo = bit;
goto requeue;
}
mdev->bm_resync_fo = bit + 1;
if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) {
drbd_rs_complete_io(mdev, sector);
goto next_sector;
}
#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE
/* try to find some adjacent bits.
* we stop if we have already the maximum req size.
*
* Additionally always align bigger requests, in order to
* be prepared for all stripe sizes of software RAIDs.
*
* we _do_ care about the agreed-upon q->max_segment_size
* here, as splitting up the requests on the other side is more
* difficult. the consequence is, that on lvm and md and other
* "indirect" devices, this is dead code, since
* q->max_segment_size will be PAGE_SIZE.
*/
align = 1;
for (;;) {
if (size + BM_BLOCK_SIZE > max_segment_size)
break;
/* Be always aligned */
if (sector & ((1<<(align+3))-1))
break;
/* do not cross extent boundaries */
if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
break;
/* now, is it actually dirty, after all?
* caution, drbd_bm_test_bit is tri-state for some
* obscure reason; ( b == 0 ) would get the out-of-band
* only accidentally right because of the "oddly sized"
* adjustment below */
if (drbd_bm_test_bit(mdev, bit+1) != 1)
break;
bit++;
size += BM_BLOCK_SIZE;
if ((BM_BLOCK_SIZE << align) <= size)
align++;
i++;
}
/* if we merged some,
* reset the offset to start the next drbd_bm_find_next from */
if (size > BM_BLOCK_SIZE)
mdev->bm_resync_fo = bit + 1;
#endif
/* adjust very last sectors, in case we are oddly sized */
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
case 0: /* Disk failure*/
put_ldev(mdev);
return 0;
case 2: /* Allocation failed */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
goto requeue;
/* case 1: everything ok */
}
} else {
inc_rs_pending(mdev);
if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
sector, size, ID_SYNCER)) {
dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
dec_rs_pending(mdev);
put_ldev(mdev);
return 0;
}
}
}
if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) {
/* last syncer _request_ was sent,
* but the P_RS_DATA_REPLY not yet received. sync will end (and
* next sync group will resume), as soon as we receive the last
* resync data block, and the last bit is cleared.
* until then resync "work" is "inactive" ...
*/
mdev->resync_work.cb = w_resync_inactive;
put_ldev(mdev);
return 1;
}
requeue:
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
}
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
int number, i, size;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
if (unlikely(cancel))
return 1;
if (unlikely(mdev->state.conn < C_CONNECTED)) {
dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected");
return 0;
}
number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
if (atomic_read(&mdev->rs_pending_cnt) > number)
goto requeue;
number -= atomic_read(&mdev->rs_pending_cnt);
sector = mdev->ov_position;
for (i = 0; i < number; i++) {
if (sector >= capacity) {
mdev->resync_work.cb = w_resync_inactive;
return 1;
}
size = BM_BLOCK_SIZE;
if (drbd_try_rs_begin_io(mdev, sector)) {
mdev->ov_position = sector;
goto requeue;
}
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
inc_rs_pending(mdev);
if (!drbd_send_ov_request(mdev, sector, size)) {
dec_rs_pending(mdev);
return 0;
}
sector += BM_SECT_PER_BIT;
}
mdev->ov_position = sector;
requeue:
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
return 1;
}
int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
kfree(w);
ov_oos_print(mdev);
drbd_resync_finished(mdev);
return 1;
}
static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
kfree(w);
drbd_resync_finished(mdev);
return 1;
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
unsigned long n_oos;
union drbd_state os, ns;
struct drbd_work *w;
char *khelper_cmd = NULL;
/* Remove all elements from the resync LRU. Since future actions
* might set bits in the (main) bitmap, then the entries in the
* resync LRU would be wrong. */
if (drbd_rs_del_all(mdev)) {
/* In case this is not possible now, most probably because
* there are P_RS_DATA_REPLY Packets lingering on the worker's
* queue (or even the read operations for those packets
* is not finished by now). Retry in 100ms. */
drbd_kick_lo(mdev);
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(HZ / 10);
w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
if (w) {
w->cb = w_resync_finished;
drbd_queue_work(&mdev->data.work, w);
return 1;
}
dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
}
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
if (dt <= 0)
dt = 1;
db = mdev->rs_total;
dbdt = Bit2KB(db/dt);
mdev->rs_paused /= HZ;
if (!get_ldev(mdev))
goto out;
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
/* This protects us against multiple calls (that can happen in the presence
of application IO), and against connectivity loss just before we arrive here. */
if (os.conn <= C_CONNECTED)
goto out_unlock;
ns = os;
ns.conn = C_CONNECTED;
dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ?
"Online verify " : "Resync",
dt + mdev->rs_paused, mdev->rs_paused, dbdt);
n_oos = drbd_bm_total_weight(mdev);
if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
if (n_oos) {
dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n",
n_oos, Bit2KB(1));
khelper_cmd = "out-of-sync";
}
} else {
D_ASSERT((n_oos - mdev->rs_failed) == 0);
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
khelper_cmd = "after-resync-target";
if (mdev->csums_tfm && mdev->rs_total) {
const unsigned long s = mdev->rs_same_csum;
const unsigned long t = mdev->rs_total;
const int ratio =
(t == 0) ? 0 :
(t < 100000) ? ((s*100)/t) : (s/(t/100));
dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; "
"transferred %luK total %luK\n",
ratio,
Bit2KB(mdev->rs_same_csum),
Bit2KB(mdev->rs_total - mdev->rs_same_csum),
Bit2KB(mdev->rs_total));
}
}
if (mdev->rs_failed) {
dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed);
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
ns.disk = D_INCONSISTENT;
ns.pdsk = D_UP_TO_DATE;
} else {
ns.disk = D_UP_TO_DATE;
ns.pdsk = D_INCONSISTENT;
}
} else {
ns.disk = D_UP_TO_DATE;
ns.pdsk = D_UP_TO_DATE;
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
if (mdev->p_uuid) {
int i;
for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
_drbd_uuid_set(mdev, i, mdev->p_uuid[i]);
drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]);
_drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]);
} else {
dev_err(DEV, "mdev->p_uuid is NULL! BUG\n");
}
}
drbd_uuid_set_bm(mdev, 0UL);
if (mdev->p_uuid) {
/* Now the two UUID sets are equal, update what we
* know of the peer. */
int i;
for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
mdev->p_uuid[i] = mdev->ldev->md.uuid[i];
}
}
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
spin_unlock_irq(&mdev->req_lock);
put_ldev(mdev);
out:
mdev->rs_total = 0;
mdev->rs_failed = 0;
mdev->rs_paused = 0;
mdev->ov_start_sector = 0;
if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
}
if (khelper_cmd)
drbd_khelper(mdev, khelper_cmd);
return 1;
}
/* helper */
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
if (drbd_bio_has_active_page(e->private_bio)) {
/* This might happen if sendpage() has not finished */
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
} else
drbd_free_ee(mdev, e);
}
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int ok;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
dec_unacked(mdev);
return 1;
}
if (likely(drbd_bio_uptodate(e->private_bio))) {
ok = drbd_send_block(mdev, P_DATA_REPLY, e);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
(unsigned long long)e->sector);
ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
}
dec_unacked(mdev);
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
dev_err(DEV, "drbd_send_block() failed\n");
return ok;
}
/**
* w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int ok;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
dec_unacked(mdev);
return 1;
}
if (get_ldev_if_state(mdev, D_FAILED)) {
drbd_rs_complete_io(mdev, e->sector);
put_ldev(mdev);
}
if (likely(drbd_bio_uptodate(e->private_bio))) {
if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(mdev);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Not sending RSDataReply, "
"partner DISKLESS!\n");
ok = 1;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
(unsigned long long)e->sector);
ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
/* update resync data with failure */
drbd_rs_failed_io(mdev, e->sector, e->size);
}
dec_unacked(mdev);
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
dev_err(DEV, "drbd_send_block() failed\n");
return ok;
}
int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct digest_info *di;
int digest_size;
void *digest = NULL;
int ok, eq = 0;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
dec_unacked(mdev);
return 1;
}
drbd_rs_complete_io(mdev, e->sector);
di = (struct digest_info *)(unsigned long)e->block_id;
if (likely(drbd_bio_uptodate(e->private_bio))) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
if (mdev->csums_tfm) {
digest_size = crypto_hash_digestsize(mdev->csums_tfm);
D_ASSERT(digest_size == di->digest_size);
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (eq) {
drbd_set_in_sync(mdev, e->sector, e->size);
mdev->rs_same_csum++;
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
} else {
inc_rs_pending(mdev);
e->block_id = ID_SYNCER;
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
}
} else {
ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(mdev);
kfree(di);
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
dev_err(DEV, "drbd_send_block/ack() failed\n");
return ok;
}
int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
int digest_size;
void *digest;
int ok = 1;
if (unlikely(cancel))
goto out;
if (unlikely(!drbd_bio_uptodate(e->private_bio)))
goto out;
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
/* FIXME if this allocation fails, online verify will not terminate! */
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
digest, digest_size, P_OV_REPLY);
if (!ok)
dec_rs_pending(mdev);
kfree(digest);
}
out:
drbd_free_ee(mdev, e);
dec_unacked(mdev);
return ok;
}
void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
{
if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
mdev->ov_last_oos_size += size>>9;
} else {
mdev->ov_last_oos_start = sector;
mdev->ov_last_oos_size = size>>9;
}
drbd_set_out_of_sync(mdev, sector, size);
set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
}
int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
struct digest_info *di;
int digest_size;
void *digest;
int ok, eq = 0;
if (unlikely(cancel)) {
drbd_free_ee(mdev, e);
dec_unacked(mdev);
return 1;
}
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
drbd_rs_complete_io(mdev, e->sector);
di = (struct digest_info *)(unsigned long)e->block_id;
if (likely(drbd_bio_uptodate(e->private_bio))) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
D_ASSERT(digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
} else {
ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(mdev);
kfree(di);
if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size);
else
ov_oos_print(mdev);
ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size,
eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
drbd_free_ee(mdev, e);
if (--mdev->ov_left == 0) {
ov_oos_print(mdev);
drbd_resync_finished(mdev);
}
return ok;
}
int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
complete(&b->done);
return 1;
}
int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
struct p_barrier *p = &mdev->data.sbuf.barrier;
int ok = 1;
/* really avoid racing with tl_clear. w.cb may have been referenced
* just before it was reassigned and re-queued, so double check that.
* actually, this race was harmless, since we only try to send the
* barrier packet here, and otherwise do nothing with the object.
* but compare with the head of w_clear_epoch */
spin_lock_irq(&mdev->req_lock);
if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
cancel = 1;
spin_unlock_irq(&mdev->req_lock);
if (cancel)
return 1;
if (!drbd_get_data_sock(mdev))
return 0;
p->barrier = b->br_number;
/* inc_ap_pending was done where this was queued.
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
(struct p_header *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
}
int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
if (cancel)
return 1;
return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
}
/**
* w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
int ok;
if (unlikely(cancel)) {
req_mod(req, send_canceled);
return 1;
}
ok = drbd_send_dblock(mdev, req);
req_mod(req, ok ? handed_over_to_network : send_failed);
return ok;
}
/**
* w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
int ok;
if (unlikely(cancel)) {
req_mod(req, send_canceled);
return 1;
}
ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
(unsigned long)req);
if (!ok) {
/* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
* so this is probably redundant */
if (mdev->state.conn >= C_CONNECTED)
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
}
req_mod(req, ok ? handed_over_to_network : send_failed);
return ok;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
while (1) {
if (odev->sync_conf.after == -1)
return 1;
odev = minor_to_mdev(odev->sync_conf.after);
ERR_IF(!odev) return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
odev->state.aftr_isp || odev->state.peer_isp ||
odev->state.user_isp)
return 0;
}
}
/**
* _drbd_pause_after() - Pause resync on all devices that may not resync now
* @mdev: DRBD device.
*
* Called from process context only (admin command and after_state_ch).
*/
static int _drbd_pause_after(struct drbd_conf *mdev)
{
struct drbd_conf *odev;
int i, rv = 0;
for (i = 0; i < minor_count; i++) {
odev = minor_to_mdev(i);
if (!odev)
continue;
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (!_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
!= SS_NOTHING_TO_DO);
}
return rv;
}
/**
* _drbd_resume_next() - Resume resync on all devices that may resync now
* @mdev: DRBD device.
*
* Called from process context only (admin command and worker).
*/
static int _drbd_resume_next(struct drbd_conf *mdev)
{
struct drbd_conf *odev;
int i, rv = 0;
for (i = 0; i < minor_count; i++) {
odev = minor_to_mdev(i);
if (!odev)
continue;
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (odev->state.aftr_isp) {
if (_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
CS_HARD, NULL)
!= SS_NOTHING_TO_DO) ;
}
}
return rv;
}
void resume_next_sg(struct drbd_conf *mdev)
{
write_lock_irq(&global_state_lock);
_drbd_resume_next(mdev);
write_unlock_irq(&global_state_lock);
}
void suspend_other_sg(struct drbd_conf *mdev)
{
write_lock_irq(&global_state_lock);
_drbd_pause_after(mdev);
write_unlock_irq(&global_state_lock);
}
static int sync_after_error(struct drbd_conf *mdev, int o_minor)
{
struct drbd_conf *odev;
if (o_minor == -1)
return NO_ERROR;
if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
return ERR_SYNC_AFTER;
/* check for loops */
odev = minor_to_mdev(o_minor);
while (1) {
if (odev == mdev)
return ERR_SYNC_AFTER_CYCLE;
/* dependency chain ends here, no cycles. */
if (odev->sync_conf.after == -1)
return NO_ERROR;
/* follow the dependency chain */
odev = minor_to_mdev(odev->sync_conf.after);
}
}
int drbd_alter_sa(struct drbd_conf *mdev, int na)
{
int changes;
int retcode;
write_lock_irq(&global_state_lock);
retcode = sync_after_error(mdev, na);
if (retcode == NO_ERROR) {
mdev->sync_conf.after = na;
do {
changes = _drbd_pause_after(mdev);
changes |= _drbd_resume_next(mdev);
} while (changes);
}
write_unlock_irq(&global_state_lock);
return retcode;
}
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
* @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
*
* This function might bring you directly into one of the
* C_PAUSED_SYNC_* states.
*/
void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
{
union drbd_state ns;
int r;
if (mdev->state.conn >= C_SYNC_SOURCE) {
dev_err(DEV, "Resync already running!\n");
return;
}
trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n",
side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource");
/* In case a previous resync run was aborted by an IO error/detach on the peer. */
drbd_rs_cancel_all(mdev);
if (side == C_SYNC_TARGET) {
/* Since application IO was locked out during C_WF_BITMAP_T and
C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
we check that we might make the data inconsistent. */
r = drbd_khelper(mdev, "before-resync-target");
r = (r >> 8) & 0xff;
if (r > 0) {
dev_info(DEV, "before-resync-target handler returned %d, "
"dropping connection.\n", r);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return;
}
}
drbd_state_lock(mdev);
if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
drbd_state_unlock(mdev);
return;
}
if (side == C_SYNC_TARGET) {
mdev->bm_resync_fo = 0;
} else /* side == C_SYNC_SOURCE */ {
u64 uuid;
get_random_bytes(&uuid, sizeof(u64));
drbd_uuid_set(mdev, UI_BITMAP, uuid);
drbd_send_sync_uuid(mdev, uuid);
D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
}
write_lock_irq(&global_state_lock);
ns = mdev->state;
ns.aftr_isp = !_drbd_may_sync_now(mdev);
ns.conn = side;
if (side == C_SYNC_TARGET)
ns.disk = D_INCONSISTENT;
else /* side == C_SYNC_SOURCE */
ns.pdsk = D_INCONSISTENT;
r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
ns = mdev->state;
if (ns.conn < C_CONNECTED)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
mdev->rs_total =
mdev->rs_mark_left = drbd_bm_total_weight(mdev);
mdev->rs_failed = 0;
mdev->rs_paused = 0;
mdev->rs_start =
mdev->rs_mark_time = jiffies;
mdev->rs_same_csum = 0;
_drbd_pause_after(mdev);
}
write_unlock_irq(&global_state_lock);
drbd_state_unlock(mdev);
put_ldev(mdev);
if (r == SS_SUCCESS) {
dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
drbd_conn_str(ns.conn),
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
if (mdev->rs_total == 0) {
/* Peer still reachable? Beware of failing before-resync-target handlers! */
request_ping(mdev);
__set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */
drbd_resync_finished(mdev);
return;
}
/* ns.conn may already be != mdev->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.
* No matter, that is handled in resync_timer_fn() */
if (ns.conn == C_SYNC_TARGET)
mod_timer(&mdev->resync_timer, jiffies);
drbd_md_sync(mdev);
}
}
int drbd_worker(struct drbd_thread *thi)
{
struct drbd_conf *mdev = thi->mdev;
struct drbd_work *w = NULL;
LIST_HEAD(work_list);
int intr = 0, i;
sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
while (get_t_state(thi) == Running) {
drbd_thread_current_set_cpu(mdev);
if (down_trylock(&mdev->data.work.s)) {
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket && !mdev->net_conf->no_cork)
drbd_tcp_uncork(mdev->data.socket);
mutex_unlock(&mdev->data.mutex);
intr = down_interruptible(&mdev->data.work.s);
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket && !mdev->net_conf->no_cork)
drbd_tcp_cork(mdev->data.socket);
mutex_unlock(&mdev->data.mutex);
}
if (intr) {
D_ASSERT(intr == -EINTR);
flush_signals(current);
ERR_IF (get_t_state(thi) == Running)
continue;
break;
}
if (get_t_state(thi) != Running)
break;
/* With this break, we have done a down() but not consumed
the entry from the list. The cleanup code takes care of
this... */
w = NULL;
spin_lock_irq(&mdev->data.work.q_lock);
ERR_IF(list_empty(&mdev->data.work.q)) {
/* something terribly wrong in our logic.
* we were able to down() the semaphore,
* but the list is empty... doh.
*
* what is the best thing to do now?
* try again from scratch, restarting the receiver,
* asender, whatnot? could break even more ugly,
* e.g. when we are primary, but no good local data.
*
* I'll try to get away just starting over this loop.
*/
spin_unlock_irq(&mdev->data.work.q_lock);
continue;
}
w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
list_del_init(&w->list);
spin_unlock_irq(&mdev->data.work.q_lock);
if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
/* dev_warn(DEV, "worker: a callback failed! \n"); */
if (mdev->state.conn >= C_CONNECTED)
drbd_force_state(mdev,
NS(conn, C_NETWORK_FAILURE));
}
}
D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags));
D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags));
spin_lock_irq(&mdev->data.work.q_lock);
i = 0;
while (!list_empty(&mdev->data.work.q)) {
list_splice_init(&mdev->data.work.q, &work_list);
spin_unlock_irq(&mdev->data.work.q_lock);
while (!list_empty(&work_list)) {
w = list_entry(work_list.next, struct drbd_work, list);
list_del_init(&w->list);
w->cb(mdev, w, 1);
i++; /* dead debugging code */
}
spin_lock_irq(&mdev->data.work.q_lock);
}
sema_init(&mdev->data.work.s, 0);
/* DANGEROUS race: if someone did queue his work within the spinlock,
* but up() ed outside the spinlock, we could get an up() on the
* semaphore without corresponding list entry.
* So don't do that.
*/
spin_unlock_irq(&mdev->data.work.q_lock);
D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
/* _drbd_set_state only uses stop_nowait.
* wait here for the Exiting receiver. */
drbd_thread_stop(&mdev->receiver);
drbd_mdev_cleanup(mdev);
dev_info(DEV, "worker terminated\n");
clear_bit(DEVICE_DYING, &mdev->flags);
clear_bit(CONFIG_PENDING, &mdev->flags);
wake_up(&mdev->state_wait);
return 0;
}
#ifndef _DRBD_WRAPPERS_H
#define _DRBD_WRAPPERS_H
#include <linux/ctype.h>
#include <linux/mm.h>
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
/* sets the number of 512 byte sectors of our virtual device */
static inline void drbd_set_my_capacity(struct drbd_conf *mdev,
sector_t size)
{
/* set_capacity(mdev->this_bdev->bd_disk, size); */
set_capacity(mdev->vdisk, size);
mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9;
}
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
static inline int drbd_bio_has_active_page(struct bio *bio)
{
struct bio_vec *bvec;
int i;
__bio_for_each_segment(bvec, bio, i, 0) {
if (page_count(bvec->bv_page) > 1)
return 1;
}
return 0;
}
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
extern void drbd_endio_read_sec(struct bio *bio, int error);
extern void drbd_endio_write_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
/*
* used to submit our private bio
*/
static inline void drbd_generic_make_request(struct drbd_conf *mdev,
int fault_type, struct bio *bio)
{
__release(local);
if (!bio->bi_bdev) {
printk(KERN_ERR "drbd%d: drbd_generic_make_request: "
"bio->bi_bdev == NULL\n",
mdev_to_minor(mdev));
dump_stack();
bio_endio(bio, -ENODEV);
return;
}
if (FAULT_ACTIVE(mdev, fault_type))
bio_endio(bio, -EIO);
else
generic_make_request(bio);
}
static inline void drbd_plug_device(struct drbd_conf *mdev)
{
struct request_queue *q;
q = bdev_get_queue(mdev->this_bdev);
spin_lock_irq(q->queue_lock);
/* XXX the check on !blk_queue_plugged is redundant,
* implicitly checked in blk_plug_device */
if (!blk_queue_plugged(q)) {
blk_plug_device(q);
del_timer(&q->unplug_timer);
/* unplugging should not happen automatically... */
}
spin_unlock_irq(q->queue_lock);
}
static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
{
return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
== CRYPTO_ALG_TYPE_HASH;
}
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
#endif
#endif
/*
drbd.h
Kernel module for 2.6.x Kernels
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2001-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2001-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef DRBD_H
#define DRBD_H
#include <linux/connector.h>
#include <asm/types.h>
#ifdef __KERNEL__
#include <linux/types.h>
#include <asm/byteorder.h>
#else
#include <sys/types.h>
#include <sys/wait.h>
#include <limits.h>
/* Altough the Linux source code makes a difference between
generic endianness and the bitfields' endianness, there is no
architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness
does not match the generic endianness. */
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define __LITTLE_ENDIAN_BITFIELD
#elif __BYTE_ORDER == __BIG_ENDIAN
#define __BIG_ENDIAN_BITFIELD
#else
# error "sorry, weird endianness on this box"
#endif
#endif
extern const char *drbd_buildtag(void);
#define REL_VERSION "8.3.3rc2"
#define API_VERSION 88
#define PRO_VERSION_MIN 86
#define PRO_VERSION_MAX 91
enum drbd_io_error_p {
EP_PASS_ON, /* FIXME should the better be named "Ignore"? */
EP_CALL_HELPER,
EP_DETACH
};
enum drbd_fencing_p {
FP_DONT_CARE,
FP_RESOURCE,
FP_STONITH
};
enum drbd_disconnect_p {
DP_RECONNECT,
DP_DROP_NET_CONF,
DP_FREEZE_IO
};
enum drbd_after_sb_p {
ASB_DISCONNECT,
ASB_DISCARD_YOUNGER_PRI,
ASB_DISCARD_OLDER_PRI,
ASB_DISCARD_ZERO_CHG,
ASB_DISCARD_LEAST_CHG,
ASB_DISCARD_LOCAL,
ASB_DISCARD_REMOTE,
ASB_CONSENSUS,
ASB_DISCARD_SECONDARY,
ASB_CALL_HELPER,
ASB_VIOLENTLY
};
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_codes {
ERR_CODE_BASE = 100,
NO_ERROR = 101,
ERR_LOCAL_ADDR = 102,
ERR_PEER_ADDR = 103,
ERR_OPEN_DISK = 104,
ERR_OPEN_MD_DISK = 105,
ERR_DISK_NOT_BDEV = 107,
ERR_MD_NOT_BDEV = 108,
ERR_DISK_TO_SMALL = 111,
ERR_MD_DISK_TO_SMALL = 112,
ERR_BDCLAIM_DISK = 114,
ERR_BDCLAIM_MD_DISK = 115,
ERR_MD_IDX_INVALID = 116,
ERR_IO_MD_DISK = 118,
ERR_MD_INVALID = 119,
ERR_AUTH_ALG = 120,
ERR_AUTH_ALG_ND = 121,
ERR_NOMEM = 122,
ERR_DISCARD = 123,
ERR_DISK_CONFIGURED = 124,
ERR_NET_CONFIGURED = 125,
ERR_MANDATORY_TAG = 126,
ERR_MINOR_INVALID = 127,
ERR_INTR = 129, /* EINTR */
ERR_RESIZE_RESYNC = 130,
ERR_NO_PRIMARY = 131,
ERR_SYNC_AFTER = 132,
ERR_SYNC_AFTER_CYCLE = 133,
ERR_PAUSE_IS_SET = 134,
ERR_PAUSE_IS_CLEAR = 135,
ERR_PACKET_NR = 137,
ERR_NO_DISK = 138,
ERR_NOT_PROTO_C = 139,
ERR_NOMEM_BITMAP = 140,
ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */
ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */
ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */
ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */
ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */
ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */
ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */
ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */
ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */
ERR_DATA_NOT_CURRENT = 150,
ERR_CONNECTED = 151, /* DRBD 8.3 only */
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
};
#define DRBD_PROT_A 1
#define DRBD_PROT_B 2
#define DRBD_PROT_C 3
enum drbd_role {
R_UNKNOWN = 0,
R_PRIMARY = 1, /* role */
R_SECONDARY = 2, /* role */
R_MASK = 3,
};
/* The order of these constants is important.
* The lower ones (<C_WF_REPORT_PARAMS) indicate
* that there is no socket!
* >=C_WF_REPORT_PARAMS ==> There is a socket
*/
enum drbd_conns {
C_STANDALONE,
C_DISCONNECTING, /* Temporal state on the way to StandAlone. */
C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */
/* These temporal states are all used on the way
* from >= C_CONNECTED to Unconnected.
* The 'disconnect reason' states
* I do not allow to change beween them. */
C_TIMEOUT,
C_BROKEN_PIPE,
C_NETWORK_FAILURE,
C_PROTOCOL_ERROR,
C_TEAR_DOWN,
C_WF_CONNECTION,
C_WF_REPORT_PARAMS, /* we have a socket */
C_CONNECTED, /* we have introduced each other */
C_STARTING_SYNC_S, /* starting full sync by admin request. */
C_STARTING_SYNC_T, /* stariing full sync by admin request. */
C_WF_BITMAP_S,
C_WF_BITMAP_T,
C_WF_SYNC_UUID,
/* All SyncStates are tested with this comparison
* xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */
C_SYNC_SOURCE,
C_SYNC_TARGET,
C_VERIFY_S,
C_VERIFY_T,
C_PAUSED_SYNC_S,
C_PAUSED_SYNC_T,
C_MASK = 31
};
enum drbd_disk_state {
D_DISKLESS,
D_ATTACHING, /* In the process of reading the meta-data */
D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */
/* when >= D_FAILED it is legal to access mdev->bc */
D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */
D_INCONSISTENT,
D_OUTDATED,
D_UNKNOWN, /* Only used for the peer, never for myself */
D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */
D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */
D_MASK = 15
};
union drbd_state {
/* According to gcc's docs is the ...
* The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1).
* Determined by ABI.
* pointed out by Maxim Uvarov q<muvarov@ru.mvista.com>
* even though we transmit as "cpu_to_be32(state)",
* the offsets of the bitfields still need to be swapped
* on different endianess.
*/
struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned conn:5 ; /* 17/32 cstates */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned peer_isp:1 ;
unsigned user_isp:1 ;
unsigned _pad:11; /* 0 unused */
#elif defined(__BIG_ENDIAN_BITFIELD)
unsigned _pad:11; /* 0 unused */
unsigned user_isp:1 ;
unsigned peer_isp:1 ;
unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
unsigned susp:1 ; /* 2/2 IO suspended no/yes */
unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
unsigned conn:5 ; /* 17/32 cstates */
unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
unsigned role:2 ; /* 3/4 primary/secondary/unknown */
#else
# error "this endianess is not supported"
#endif
};
unsigned int i;
};
enum drbd_state_ret_codes {
SS_CW_NO_NEED = 4,
SS_CW_SUCCESS = 3,
SS_NOTHING_TO_DO = 2,
SS_SUCCESS = 1,
SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */
SS_TWO_PRIMARIES = -1,
SS_NO_UP_TO_DATE_DISK = -2,
SS_NO_LOCAL_DISK = -4,
SS_NO_REMOTE_DISK = -5,
SS_CONNECTED_OUTDATES = -6,
SS_PRIMARY_NOP = -7,
SS_RESYNC_RUNNING = -8,
SS_ALREADY_STANDALONE = -9,
SS_CW_FAILED_BY_PEER = -10,
SS_IS_DISKLESS = -11,
SS_DEVICE_IN_USE = -12,
SS_NO_NET_CONFIG = -13,
SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */
SS_NEED_CONNECTION = -15, /* drbd-8.2 only */
SS_LOWER_THAN_OUTDATED = -16,
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
};
/* from drbd_strings.c */
extern const char *drbd_conn_str(enum drbd_conns);
extern const char *drbd_role_str(enum drbd_role);
extern const char *drbd_disk_str(enum drbd_disk_state);
extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes);
#define SHARED_SECRET_MAX 64
#define MDF_CONSISTENT (1 << 0)
#define MDF_PRIMARY_IND (1 << 1)
#define MDF_CONNECTED_IND (1 << 2)
#define MDF_FULL_SYNC (1 << 3)
#define MDF_WAS_UP_TO_DATE (1 << 4)
#define MDF_PEER_OUT_DATED (1 << 5)
#define MDF_CRASHED_PRIMARY (1 << 6)
enum drbd_uuid_index {
UI_CURRENT,
UI_BITMAP,
UI_HISTORY_START,
UI_HISTORY_END,
UI_SIZE, /* nl-packet: number of dirty bits */
UI_FLAGS, /* nl-packet: flags */
UI_EXTENDED_SIZE /* Everything. */
};
enum drbd_timeout_flag {
UT_DEFAULT = 0,
UT_DEGRADED = 1,
UT_PEER_OUTDATED = 2,
};
#define UUID_JUST_CREATED ((__u64)4)
#define DRBD_MAGIC 0x83740267
#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
/* Start of the new netlink/connector stuff */
#define DRBD_NL_CREATE_DEVICE 0x01
#define DRBD_NL_SET_DEFAULTS 0x02
/* The following line should be moved over to linux/connector.h
* when the time comes */
#ifndef CN_IDX_DRBD
# define CN_IDX_DRBD 0x4
/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */
#endif
#define CN_VAL_DRBD 0x1
/* For searching a vacant cn_idx value */
#define CN_IDX_STEP 6977
struct drbd_nl_cfg_req {
int packet_type;
unsigned int drbd_minor;
int flags;
unsigned short tag_list[];
};
struct drbd_nl_cfg_reply {
int packet_type;
unsigned int minor;
int ret_code; /* enum ret_code or set_st_err_t */
unsigned short tag_list[]; /* only used with get_* calls */
};
#endif
/*
drbd_limits.h
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
*/
/*
* Our current limitations.
* Some of them are hard limits,
* some of them are arbitrary range limits, that make it easier to provide
* feedback about nonsense settings for certain configurable values.
*/
#ifndef DRBD_LIMITS_H
#define DRBD_LIMITS_H 1
#define DEBUG_RANGE_CHECK 0
#define DRBD_MINOR_COUNT_MIN 1
#define DRBD_MINOR_COUNT_MAX 255
#define DRBD_DIALOG_REFRESH_MIN 0
#define DRBD_DIALOG_REFRESH_MAX 600
/* valid port number */
#define DRBD_PORT_MIN 1
#define DRBD_PORT_MAX 0xffff
/* startup { */
/* if you want more than 3.4 days, disable */
#define DRBD_WFC_TIMEOUT_MIN 0
#define DRBD_WFC_TIMEOUT_MAX 300000
#define DRBD_WFC_TIMEOUT_DEF 0
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
/* }*/
/* net { */
/* timeout, unit centi seconds
* more than one minute timeout is not usefull */
#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
/* active connection retries when C_WF_CONNECTION */
#define DRBD_CONNECT_INT_MIN 1
#define DRBD_CONNECT_INT_MAX 120
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
/* keep-alive probes when idle */
#define DRBD_PING_INT_MIN 1
#define DRBD_PING_INT_MAX 120
#define DRBD_PING_INT_DEF 10
/* timeout for the ping packets.*/
#define DRBD_PING_TIMEO_MIN 1
#define DRBD_PING_TIMEO_MAX 100
#define DRBD_PING_TIMEO_DEF 5
/* max number of write requests between write barriers */
#define DRBD_MAX_EPOCH_SIZE_MIN 1
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
/* I don't think that a tcp send buffer of more than 10M is usefull */
#define DRBD_SNDBUF_SIZE_MIN 0
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
#define DRBD_SNDBUF_SIZE_DEF (2*65535)
#define DRBD_RCVBUF_SIZE_MIN 0
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
#define DRBD_RCVBUF_SIZE_DEF (2*65535)
/* @4k PageSize -> 128kB - 512MB */
#define DRBD_MAX_BUFFERS_MIN 32
#define DRBD_MAX_BUFFERS_MAX 131072
#define DRBD_MAX_BUFFERS_DEF 2048
/* @4k PageSize -> 4kB - 512MB */
#define DRBD_UNPLUG_WATERMARK_MIN 1
#define DRBD_UNPLUG_WATERMARK_MAX 131072
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
/* 0 is disabled.
* 200 should be more than enough even for very short timeouts */
#define DRBD_KO_COUNT_MIN 0
#define DRBD_KO_COUNT_MAX 200
#define DRBD_KO_COUNT_DEF 0
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
#define DRBD_RATE_MIN 1
/* channel bonding 10 GbE, or other hardware */
#define DRBD_RATE_MAX (4 << 20)
#define DRBD_RATE_DEF 250 /* kb/second */
/* less than 7 would hit performance unneccessarily.
* 3833 is the largest prime that still does fit
* into 64 sectors of activity log */
#define DRBD_AL_EXTENTS_MIN 7
#define DRBD_AL_EXTENTS_MAX 3833
#define DRBD_AL_EXTENTS_DEF 127
#define DRBD_AFTER_MIN -1
#define DRBD_AFTER_MAX 255
#define DRBD_AFTER_DEF -1
/* } */
/* drbdsetup XY resize -d Z
* you are free to reduce the device size to nothing, if you want to.
* the upper limit with 64bit kernel, enough ram and flexible meta data
* is 16 TB, currently. */
/* DRBD_MAX_SECTORS */
#define DRBD_DISK_SIZE_SECT_MIN 0
#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30))
#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
#define DRBD_FENCING_DEF FP_DONT_CARE
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
#undef RANGE
#endif
/*
PAKET( name,
TYPE ( pn, pr, member )
...
)
You may never reissue one of the pn arguments
*/
#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
#endif
NL_PACKET(primary, 1,
NL_BIT( 1, T_MAY_IGNORE, overwrite_peer)
)
NL_PACKET(secondary, 2, )
NL_PACKET(disk_conf, 3,
NL_INT64( 2, T_MAY_IGNORE, disk_size)
NL_STRING( 3, T_MANDATORY, backing_dev, 128)
NL_STRING( 4, T_MANDATORY, meta_dev, 128)
NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
NL_INTEGER( 7, T_MAY_IGNORE, fencing)
NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
/* 55 max_bio_size was available in 8.2.6rc2 */
NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
)
NL_PACKET(detach, 4, )
NL_PACKET(net_conf, 5,
NL_STRING( 8, T_MANDATORY, my_addr, 128)
NL_STRING( 9, T_MANDATORY, peer_addr, 128)
NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
NL_INTEGER( 14, T_MAY_IGNORE, timeout)
NL_INTEGER( 15, T_MANDATORY, wire_protocol)
NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
/* 59 addr_family was available in GIT, never released */
NL_BIT( 60, T_MANDATORY, mind_af)
NL_BIT( 27, T_MAY_IGNORE, want_lose)
NL_BIT( 28, T_MAY_IGNORE, two_primaries)
NL_BIT( 41, T_MAY_IGNORE, always_asbp)
NL_BIT( 61, T_MAY_IGNORE, no_cork)
NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
)
NL_PACKET(disconnect, 6, )
NL_PACKET(resize, 7,
NL_INT64( 29, T_MAY_IGNORE, resize_size)
)
NL_PACKET(syncer_conf, 8,
NL_INTEGER( 30, T_MAY_IGNORE, rate)
NL_INTEGER( 31, T_MAY_IGNORE, after)
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
NL_BIT( 65, T_MAY_IGNORE, use_rle)
)
NL_PACKET(invalidate, 9, )
NL_PACKET(invalidate_peer, 10, )
NL_PACKET(pause_sync, 11, )
NL_PACKET(resume_sync, 12, )
NL_PACKET(suspend_io, 13, )
NL_PACKET(resume_io, 14, )
NL_PACKET(outdate, 15, )
NL_PACKET(get_config, 16, )
NL_PACKET(get_state, 17,
NL_INTEGER( 33, T_MAY_IGNORE, state_i)
)
NL_PACKET(get_uuids, 18,
NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
)
NL_PACKET(get_timeout_flag, 19,
NL_BIT( 36, T_MAY_IGNORE, use_degraded)
)
NL_PACKET(call_helper, 20,
NL_STRING( 38, T_MAY_IGNORE, helper, 32)
)
/* Tag nr 42 already allocated in drbd-8.1 development. */
NL_PACKET(sync_progress, 23,
NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
)
NL_PACKET(dump_ee, 24,
NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
NL_INT64( 48, T_MAY_IGNORE, ee_sector)
NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
)
NL_PACKET(start_ov, 25,
NL_INT64( 66, T_MAY_IGNORE, start_sector)
)
NL_PACKET(new_c_uuid, 26,
NL_BIT( 63, T_MANDATORY, clear_bm)
)
#undef NL_PACKET
#undef NL_INTEGER
#undef NL_INT64
#undef NL_BIT
#undef NL_STRING
#ifndef DRBD_TAG_MAGIC_H
#define DRBD_TAG_MAGIC_H
#define TT_END 0
#define TT_REMOVED 0xE000
/* declare packet_type enums */
enum packet_types {
#define NL_PACKET(name, number, fields) P_ ## name = number,
#define NL_INTEGER(pn, pr, member)
#define NL_INT64(pn, pr, member)
#define NL_BIT(pn, pr, member)
#define NL_STRING(pn, pr, member, len)
#include "drbd_nl.h"
P_nl_after_last_packet,
};
/* These struct are used to deduce the size of the tag lists: */
#define NL_PACKET(name, number, fields) \
struct name ## _tag_len_struct { fields };
#define NL_INTEGER(pn, pr, member) \
int member; int tag_and_len ## member;
#define NL_INT64(pn, pr, member) \
__u64 member; int tag_and_len ## member;
#define NL_BIT(pn, pr, member) \
unsigned char member:1; int tag_and_len ## member;
#define NL_STRING(pn, pr, member, len) \
unsigned char member[len]; int member ## _len; \
int tag_and_len ## member;
#include "linux/drbd_nl.h"
/* declate tag-list-sizes */
static const int tag_list_sizes[] = {
#define NL_PACKET(name, number, fields) 2 fields ,
#define NL_INTEGER(pn, pr, member) + 4 + 4
#define NL_INT64(pn, pr, member) + 4 + 8
#define NL_BIT(pn, pr, member) + 4 + 1
#define NL_STRING(pn, pr, member, len) + 4 + (len)
#include "drbd_nl.h"
};
/* The two highest bits are used for the tag type */
#define TT_MASK 0xC000
#define TT_INTEGER 0x0000
#define TT_INT64 0x4000
#define TT_BIT 0x8000
#define TT_STRING 0xC000
/* The next bit indicates if processing of the tag is mandatory */
#define T_MANDATORY 0x2000
#define T_MAY_IGNORE 0x0000
#define TN_MASK 0x1fff
/* The remaining 13 bits are used to enumerate the tags */
#define tag_type(T) ((T) & TT_MASK)
#define tag_number(T) ((T) & TN_MASK)
/* declare tag enums */
#define NL_PACKET(name, number, fields) fields
enum drbd_tags {
#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
#include "drbd_nl.h"
};
struct tag {
const char *name;
int type_n_flags;
int max_len;
};
/* declare tag names */
#define NL_PACKET(name, number, fields) fields
static const struct tag tag_descriptions[] = {
#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
#include "drbd_nl.h"
};
#endif
/*
lru_cache.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#ifndef LRU_CACHE_H
#define LRU_CACHE_H
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/string.h> /* for memset */
#include <linux/seq_file.h>
/*
This header file (and its .c file; kernel-doc of functions see there)
define a helper framework to easily keep track of index:label associations,
and changes to an "active set" of objects, as well as pending transactions,
to persistently record those changes.
We use an LRU policy if it is necessary to "cool down" a region currently in
the active set before we can "heat" a previously unused region.
Because of this later property, it is called "lru_cache".
As it actually Tracks Objects in an Active SeT, we could also call it
toast (incidentally that is what may happen to the data on the
backend storage uppon next resync, if we don't get it right).
What for?
We replicate IO (more or less synchronously) to local and remote disk.
For crash recovery after replication node failure,
we need to resync all regions that have been target of in-flight WRITE IO
(in use, or "hot", regions), as we don't know wether or not those WRITEs have
made it to stable storage.
To avoid a "full resync", we need to persistently track these regions.
This is known as "write intent log", and can be implemented as on-disk
(coarse or fine grained) bitmap, or other meta data.
To avoid the overhead of frequent extra writes to this meta data area,
usually the condition is softened to regions that _may_ have been target of
in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent
bitmap, trading frequency of meta data transactions against amount of
(possibly unneccessary) resync traffic.
If we set a hard limit on the area that may be "hot" at any given time, we
limit the amount of resync traffic needed for crash recovery.
For recovery after replication link failure,
we need to resync all blocks that have been changed on the other replica
in the mean time, or, if both replica have been changed independently [*],
all blocks that have been changed on either replica in the mean time.
[*] usually as a result of a cluster split-brain and insufficient protection.
but there are valid use cases to do this on purpose.
Tracking those blocks can be implemented as "dirty bitmap".
Having it fine-grained reduces the amount of resync traffic.
It should also be persistent, to allow for reboots (or crashes)
while the replication link is down.
There are various possible implementations for persistently storing
write intent log information, three of which are mentioned here.
"Chunk dirtying"
The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well.
To reduce the frequency of bitmap updates for write-intent log purposes,
one could dirty "chunks" (of some size) at a time of the (fine grained)
on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as
possible, flushing it to disk again when a previously "hot" (and on-disk
dirtied as full chunk) area "cools down" again (no IO in flight anymore,
and none expected in the near future either).
"Explicit (coarse) write intent bitmap"
An other implementation could chose a (probably coarse) explicit bitmap,
for write-intent log purposes, additionally to the fine grained dirty bitmap.
"Activity log"
Yet an other implementation may keep track of the hot regions, by starting
with an empty set, and writing down a journal of region numbers that have
become "hot", or have "cooled down" again.
To be able to use a ring buffer for this journal of changes to the active
set, we not only record the actual changes to that set, but also record the
not changing members of the set in a round robin fashion. To do so, we use a
fixed (but configurable) number of slots which we can identify by index, and
associate region numbers (labels) with these indices.
For each transaction recording a change to the active set, we record the
change itself (index: -old_label, +new_label), and which index is associated
with which label (index: current_label) within a certain sliding window that
is moved further over the available indices with each such transaction.
Thus, for crash recovery, if the ringbuffer is sufficiently large, we can
accurately reconstruct the active set.
Sufficiently large depends only on maximum number of active objects, and the
size of the sliding window recording "index: current_label" associations within
each transaction.
This is what we call the "activity log".
Currently we need one activity log transaction per single label change, which
does not give much benefit over the "dirty chunks of bitmap" approach, other
than potentially less seeks.
We plan to change the transaction format to support multiple changes per
transaction, which then would reduce several (disjoint, "random") updates to
the bitmap into one transaction to the activity log ring buffer.
*/
/* this defines an element in a tracked set
* .colision is for hash table lookup.
* When we process a new IO request, we know its sector, thus can deduce the
* region number (label) easily. To do the label -> object lookup without a
* full list walk, we use a simple hash table.
*
* .list is on one of three lists:
* in_use: currently in use (refcnt > 0, lc_number != LC_FREE)
* lru: unused but ready to be reused or recycled
* (ts_refcnt == 0, lc_number != LC_FREE),
* free: unused but ready to be recycled
* (ts_refcnt == 0, lc_number == LC_FREE),
*
* an element is said to be "in the active set",
* if either on "in_use" or "lru", i.e. lc_number != LC_FREE.
*
* DRBD currently (May 2009) only uses 61 elements on the resync lru_cache
* (total memory usage 2 pages), and up to 3833 elements on the act_log
* lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages.
*
* We usually do not actually free these objects again, but only "recycle"
* them, as the change "index: -old_label, +LC_FREE" would need a transaction
* as well. Which also means that using a kmem_cache to allocate the objects
* from wastes some resources.
* But it avoids high order page allocations in kmalloc.
*/
struct lc_element {
struct hlist_node colision;
struct list_head list; /* LRU list or free list */
unsigned refcnt;
/* back "pointer" into ts_cache->element[index],
* for paranoia, and for "ts_element_to_index" */
unsigned lc_index;
/* if we want to track a larger set of objects,
* it needs to become arch independend u64 */
unsigned lc_number;
/* special label when on free list */
#define LC_FREE (~0U)
};
struct lru_cache {
/* the least recently used item is kept at lru->prev */
struct list_head lru;
struct list_head free;
struct list_head in_use;
/* the pre-created kmem cache to allocate the objects from */
struct kmem_cache *lc_cache;
/* size of tracked objects, used to memset(,0,) them in lc_reset */
size_t element_size;
/* offset of struct lc_element member in the tracked object */
size_t element_off;
/* number of elements (indices) */
unsigned int nr_elements;
/* Arbitrary limit on maximum tracked objects. Practical limit is much
* lower due to allocation failures, probably. For typical use cases,
* nr_elements should be a few thousand at most.
* This also limits the maximum value of ts_element.ts_index, allowing the
* 8 high bits of .ts_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24)
/* statistics */
unsigned used; /* number of lelements currently on in_use list */
unsigned long hits, misses, starving, dirty, changed;
/* see below: flag-bits for lru_cache */
unsigned long flags;
/* when changing the label of an index element */
unsigned int new_number;
/* for paranoia when changing the label of an index element */
struct lc_element *changing_element;
void *lc_private;
const char *name;
/* nr_elements there */
struct hlist_head *lc_slot;
struct lc_element **lc_element;
};
/* flag-bits for lru_cache */
enum {
/* debugging aid, to catch concurrent access early.
* user needs to guarantee exclusive access by proper locking! */
__LC_PARANOIA,
/* if we need to change the set, but currently there is a changing
* transaction pending, we are "dirty", and must deferr further
* changing requests */
__LC_DIRTY,
/* if we need to change the set, but currently there is no free nor
* unused element available, we are "starving", and must not give out
* further references, to guarantee that eventually some refcnt will
* drop to zero and we will be able to make progress again, changing
* the set, writing the transaction.
* if the statistics say we are frequently starving,
* nr_elements is too small. */
__LC_STARVING,
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
#define LC_STARVING (1<<__LC_STARVING)
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
extern void lc_del(struct lru_cache *lc, struct lc_element *element);
extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
void (*detail) (struct seq_file *, struct lc_element *));
/**
* lc_try_lock - can be used to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Note that the reference counts and order on the active and lru lists may
* still change. Returns true if we aquired the lock.
*/
static inline int lc_try_lock(struct lru_cache *lc)
{
return !test_and_set_bit(__LC_DIRTY, &lc->flags);
}
/**
* lc_unlock - unlock @lc, allow lc_get() to change the set again
* @lc: the lru cache to operate on
*/
static inline void lc_unlock(struct lru_cache *lc)
{
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
}
static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e = lc_find(lc, enr);
return e && e->refcnt;
}
#define lc_entry(ptr, type, member) \
container_of(ptr, type, member)
extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
#endif
...@@ -200,4 +200,7 @@ config NLATTR ...@@ -200,4 +200,7 @@ config NLATTR
config GENERIC_ATOMIC64 config GENERIC_ATOMIC64
bool bool
config LRU_CACHE
tristate
endmenu endmenu
...@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o ...@@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
obj-$(CONFIG_NLATTR) += nlattr.o obj-$(CONFIG_NLATTR) += nlattr.o
obj-$(CONFIG_LRU_CACHE) += lru_cache.o
obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
obj-$(CONFIG_GENERIC_CSUM) += checksum.o obj-$(CONFIG_GENERIC_CSUM) += checksum.o
......
/*
lru_cache.c
This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
drbd is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
drbd is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with drbd; see the file COPYING. If not, write to
the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/string.h> /* for memset */
#include <linux/seq_file.h> /* for seq_printf */
#include <linux/lru_cache.h>
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("lru_cache - Track sets of hot objects");
MODULE_LICENSE("GPL");
/* this is developers aid only.
* it catches concurrent access (lack of locking on the users part) */
#define PARANOIA_ENTRY() do { \
BUG_ON(!lc); \
BUG_ON(!lc->nr_elements); \
BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags)); \
} while (0)
#define RETURN(x...) do { \
clear_bit(__LC_PARANOIA, &lc->flags); \
smp_mb__after_clear_bit(); return x ; } while (0)
/* BUG() if e is not one of the elements tracked by lc */
#define PARANOIA_LC_ELEMENT(lc, e) do { \
struct lru_cache *lc_ = (lc); \
struct lc_element *e_ = (e); \
unsigned i = e_->lc_index; \
BUG_ON(i >= lc_->nr_elements); \
BUG_ON(lc_->lc_element[i] != e_); } while (0)
/**
* lc_create - prepares to track objects in an active set
* @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
* @e_count: number of elements allowed to be active simultaneously
* @e_size: size of the tracked objects
* @e_off: offset to the &struct lc_element member in a tracked object
*
* Returns a pointer to a newly initialized struct lru_cache on success,
* or NULL on (allocation) failure.
*/
struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned e_count, size_t e_size, size_t e_off)
{
struct hlist_head *slot = NULL;
struct lc_element **element = NULL;
struct lru_cache *lc;
struct lc_element *e;
unsigned cache_obj_size = kmem_cache_size(cache);
unsigned i;
WARN_ON(cache_obj_size < e_size);
if (cache_obj_size < e_size)
return NULL;
/* e_count too big; would probably fail the allocation below anyways.
* for typical use cases, e_count should be few thousand at most. */
if (e_count > LC_MAX_ACTIVE)
return NULL;
slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL);
if (!slot)
goto out_fail;
element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL);
if (!element)
goto out_fail;
lc = kzalloc(sizeof(*lc), GFP_KERNEL);
if (!lc)
goto out_fail;
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
lc->name = name;
lc->element_size = e_size;
lc->element_off = e_off;
lc->nr_elements = e_count;
lc->new_number = LC_FREE;
lc->lc_cache = cache;
lc->lc_element = element;
lc->lc_slot = slot;
/* preallocate all objects */
for (i = 0; i < e_count; i++) {
void *p = kmem_cache_alloc(cache, GFP_KERNEL);
if (!p)
break;
memset(p, 0, lc->element_size);
e = p + e_off;
e->lc_index = i;
e->lc_number = LC_FREE;
list_add(&e->list, &lc->free);
element[i] = e;
}
if (i == e_count)
return lc;
/* else: could not allocate all elements, give up */
for (i--; i; i--) {
void *p = element[i];
kmem_cache_free(cache, p - e_off);
}
kfree(lc);
out_fail:
kfree(element);
kfree(slot);
return NULL;
}
void lc_free_by_index(struct lru_cache *lc, unsigned i)
{
void *p = lc->lc_element[i];
WARN_ON(!p);
if (p) {
p -= lc->element_off;
kmem_cache_free(lc->lc_cache, p);
}
}
/**
* lc_destroy - frees memory allocated by lc_create()
* @lc: the lru cache to destroy
*/
void lc_destroy(struct lru_cache *lc)
{
unsigned i;
if (!lc)
return;
for (i = 0; i < lc->nr_elements; i++)
lc_free_by_index(lc, i);
kfree(lc->lc_element);
kfree(lc->lc_slot);
kfree(lc);
}
/**
* lc_reset - does a full reset for @lc and the hash table slots.
* @lc: the lru cache to operate on
*
* It is roughly the equivalent of re-allocating a fresh lru_cache object,
* basically a short cut to lc_destroy(lc); lc = lc_create(...);
*/
void lc_reset(struct lru_cache *lc)
{
unsigned i;
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
lc->used = 0;
lc->hits = 0;
lc->misses = 0;
lc->starving = 0;
lc->dirty = 0;
lc->changed = 0;
lc->flags = 0;
lc->changing_element = NULL;
lc->new_number = LC_FREE;
memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
for (i = 0; i < lc->nr_elements; i++) {
struct lc_element *e = lc->lc_element[i];
void *p = e;
p -= lc->element_off;
memset(p, 0, lc->element_size);
/* re-init it */
e->lc_index = i;
e->lc_number = LC_FREE;
list_add(&e->list, &lc->free);
}
}
/**
* lc_seq_printf_stats - print stats about @lc into @seq
* @seq: the seq_file to print into
* @lc: the lru cache to print statistics of
*/
size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc)
{
/* NOTE:
* total calls to lc_get are
* (starving + hits + misses)
* misses include "dirty" count (update from an other thread in
* progress) and "changed", when this in fact lead to an successful
* update of the cache.
*/
return seq_printf(seq, "\t%s: used:%u/%u "
"hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
lc->name, lc->used, lc->nr_elements,
lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
}
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
{
return lc->lc_slot + (enr % lc->nr_elements);
}
/**
* lc_find - find element by label, if present in the hash table
* @lc: The lru_cache object
* @enr: element number
*
* Returns the pointer to an element, if the element with the requested
* "label" or element number is present in the hash table,
* or NULL if not found. Does not change the refcnt.
*/
struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
{
struct hlist_node *n;
struct lc_element *e;
BUG_ON(!lc);
BUG_ON(!lc->nr_elements);
hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
if (e->lc_number == enr)
return e;
}
return NULL;
}
/* returned element will be "recycled" immediately */
static struct lc_element *lc_evict(struct lru_cache *lc)
{
struct list_head *n;
struct lc_element *e;
if (list_empty(&lc->lru))
return NULL;
n = lc->lru.prev;
e = list_entry(n, struct lc_element, list);
PARANOIA_LC_ELEMENT(lc, e);
list_del(&e->list);
hlist_del(&e->colision);
return e;
}
/**
* lc_del - removes an element from the cache
* @lc: The lru_cache object
* @e: The element to remove
*
* @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list,
* sets @e->enr to %LC_FREE.
*/
void lc_del(struct lru_cache *lc, struct lc_element *e)
{
PARANOIA_ENTRY();
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt);
e->lc_number = LC_FREE;
hlist_del_init(&e->colision);
list_move(&e->list, &lc->free);
RETURN();
}
static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
{
struct list_head *n;
if (list_empty(&lc->free))
return lc_evict(lc);
n = lc->free.next;
list_del(n);
return list_entry(n, struct lc_element, list);
}
static int lc_unused_element_available(struct lru_cache *lc)
{
if (!list_empty(&lc->free))
return 1; /* something on the free list */
if (!list_empty(&lc->lru))
return 1; /* something to evict */
return 0;
}
/**
* lc_get - get element by label, maybe change the active set
* @lc: the lru cache to operate on
* @enr: the label to look up
*
* Finds an element in the cache, increases its usage count,
* "touches" and returns it.
*
* In case the requested number is not present, it needs to be added to the
* cache. Therefore it is possible that an other element becomes evicted from
* the cache. In either case, the user is notified so he is able to e.g. keep
* a persistent log of the cache changes, and therefore the objects in use.
*
* Return values:
* NULL
* The cache was marked %LC_STARVING,
* or the requested label was not in the active set
* and a changing transaction is still pending (@lc was marked %LC_DIRTY).
* Or no unused or free element could be recycled (@lc will be marked as
* %LC_STARVING, blocking further lc_get() operations).
*
* pointer to the element with the REQUESTED element number.
* In this case, it can be used right away
*
* pointer to an UNUSED element with some different element number,
* where that different number may also be %LC_FREE.
*
* In this case, the cache is marked %LC_DIRTY (blocking further changes),
* and the returned element pointer is removed from the lru list and
* hash collision chains. The user now should do whatever housekeeping
* is necessary.
* Then he must call lc_changed(lc,element_pointer), to finish
* the change.
*
* NOTE: The user needs to check the lc_number on EACH use, so he recognizes
* any cache set change.
*/
struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
e = lc_find(lc, enr);
if (e) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
RETURN(e);
}
++lc->misses;
/* In case there is nothing available and we can not kick out
* the LRU element, we have to wait ...
*/
if (!lc_unused_element_available(lc)) {
__set_bit(__LC_STARVING, &lc->flags);
RETURN(NULL);
}
/* it was not present in the active set.
* we are going to recycle an unused (or even "free") element.
* user may need to commit a transaction to record that change.
* we serialize on flags & TF_DIRTY */
if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
++lc->dirty;
RETURN(NULL);
}
e = lc_get_unused_element(lc);
BUG_ON(!e);
clear_bit(__LC_STARVING, &lc->flags);
BUG_ON(++e->refcnt != 1);
lc->used++;
lc->changing_element = e;
lc->new_number = enr;
RETURN(e);
}
/* similar to lc_get,
* but only gets a new reference on an existing element.
* you either get the requested element, or NULL.
* will be consolidated into one function.
*/
struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
{
struct lc_element *e;
PARANOIA_ENTRY();
if (lc->flags & LC_STARVING) {
++lc->starving;
RETURN(NULL);
}
e = lc_find(lc, enr);
if (e) {
++lc->hits;
if (e->refcnt++ == 0)
lc->used++;
list_move(&e->list, &lc->in_use); /* Not evictable... */
}
RETURN(e);
}
/**
* lc_changed - tell @lc that the change has been recorded
* @lc: the lru cache to operate on
* @e: the element pending label change
*/
void lc_changed(struct lru_cache *lc, struct lc_element *e)
{
PARANOIA_ENTRY();
BUG_ON(e != lc->changing_element);
PARANOIA_LC_ELEMENT(lc, e);
++lc->changed;
e->lc_number = lc->new_number;
list_add(&e->list, &lc->in_use);
hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
lc->changing_element = NULL;
lc->new_number = LC_FREE;
clear_bit(__LC_DIRTY, &lc->flags);
smp_mb__after_clear_bit();
RETURN();
}
/**
* lc_put - give up refcnt of @e
* @lc: the lru cache to operate on
* @e: the element to put
*
* If refcnt reaches zero, the element is moved to the lru list,
* and a %LC_STARVING (if set) is cleared.
* Returns the new (post-decrement) refcnt.
*/
unsigned int lc_put(struct lru_cache *lc, struct lc_element *e)
{
PARANOIA_ENTRY();
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt == 0);
BUG_ON(e == lc->changing_element);
if (--e->refcnt == 0) {
/* move it to the front of LRU. */
list_move(&e->list, &lc->lru);
lc->used--;
clear_bit(__LC_STARVING, &lc->flags);
smp_mb__after_clear_bit();
}
RETURN(e->refcnt);
}
/**
* lc_element_by_index
* @lc: the lru cache to operate on
* @i: the index of the element to return
*/
struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
{
BUG_ON(i >= lc->nr_elements);
BUG_ON(lc->lc_element[i] == NULL);
BUG_ON(lc->lc_element[i]->lc_index != i);
return lc->lc_element[i];
}
/**
* lc_index_of
* @lc: the lru cache to operate on
* @e: the element to query for its index position in lc->element
*/
unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
{
PARANOIA_LC_ELEMENT(lc, e);
return e->lc_index;
}
/**
* lc_set - associate index with label
* @lc: the lru cache to operate on
* @enr: the label to set
* @index: the element index to associate label with.
*
* Used to initialize the active set to some previously recorded state.
*/
void lc_set(struct lru_cache *lc, unsigned int enr, int index)
{
struct lc_element *e;
if (index < 0 || index >= lc->nr_elements)
return;
e = lc_element_by_index(lc, index);
e->lc_number = enr;
hlist_del_init(&e->colision);
hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
}
/**
* lc_dump - Dump a complete LRU cache to seq in textual form.
* @lc: the lru cache to operate on
* @seq: the &struct seq_file pointer to seq_printf into
* @utext: user supplied "heading" or other info
* @detail: function pointer the user may provide to dump further details
* of the object the lc_element is embedded in.
*/
void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext,
void (*detail) (struct seq_file *, struct lc_element *))
{
unsigned int nr_elements = lc->nr_elements;
struct lc_element *e;
int i;
seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext);
for (i = 0; i < nr_elements; i++) {
e = lc_element_by_index(lc, i);
if (e->lc_number == LC_FREE) {
seq_printf(seq, "\t%2d: FREE\n", i);
} else {
seq_printf(seq, "\t%2d: %4u %4u ", i,
e->lc_number, e->refcnt);
detail(seq, e);
}
}
}
EXPORT_SYMBOL(lc_create);
EXPORT_SYMBOL(lc_reset);
EXPORT_SYMBOL(lc_destroy);
EXPORT_SYMBOL(lc_set);
EXPORT_SYMBOL(lc_del);
EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
EXPORT_SYMBOL(lc_changed);
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment